// Scraped gitweb page header (not part of the original source):
//   git.proxmox.com — ceph.git / ceph / src / client / Client.cc
//   blob d3dceee2e521192cc999d12c0a07a3f336ea1c40
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/param.h>
24 #include <fcntl.h>
25 #include <sys/file.h>
26 #include <sys/utsname.h>
27 #include <sys/uio.h>
28
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
31
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
35 #else
36 #include <sys/xattr.h>
37 #endif
38
39 #if defined(__linux__)
40 #include <linux/falloc.h>
41 #endif
42
43 #include <sys/statvfs.h>
44
45 #include "common/config.h"
46 #include "common/version.h"
47
48 #include "mon/MonClient.h"
49
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
66
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
71
72 #include "common/Cond.h"
73 #include "common/perf_counters.h"
74 #include "common/admin_socket.h"
75 #include "common/errno.h"
76 #include "include/str_list.h"
77
78 #define dout_subsys ceph_subsys_client
79
80 #include "include/lru.h"
81 #include "include/compat.h"
82 #include "include/stringify.h"
83
84 #include "Client.h"
85 #include "Inode.h"
86 #include "Dentry.h"
87 #include "Delegation.h"
88 #include "Dir.h"
89 #include "ClientSnapRealm.h"
90 #include "Fh.h"
91 #include "MetaSession.h"
92 #include "MetaRequest.h"
93 #include "ObjecterWriteback.h"
94 #include "posix_acl.h"
95
96 #include "include/ceph_assert.h"
97 #include "include/stat.h"
98
99 #include "include/cephfs/ceph_statx.h"
100
101 #if HAVE_GETGROUPLIST
102 #include <grp.h>
103 #include <pwd.h>
104 #include <unistd.h>
105 #endif
106
107 #undef dout_prefix
108 #define dout_prefix *_dout << "client." << whoami << " "
109
110 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
111
112 // FreeBSD fails to define this
113 #ifndef O_DSYNC
114 #define O_DSYNC 0x0
115 #endif
116 // Darwin fails to define this
117 #ifndef O_RSYNC
118 #define O_RSYNC 0x0
119 #endif
120
121 #ifndef O_DIRECT
122 #define O_DIRECT 0x0
123 #endif
124
125 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
126
127 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
128 {
129 Client *client = static_cast<Client*>(p);
130 client->flush_set_callback(oset);
131 }
132
133
134 // -------------
135
136 Client::CommandHook::CommandHook(Client *client) :
137 m_client(client)
138 {
139 }
140
/*
 * Admin-socket command dispatcher.  Renders the requested introspection
 * command into @f inside a "result" section.
 *
 * @param command  registered command name (one of the names registered in
 *                 Client::_finish_init)
 * @param cmdmap   parsed command arguments (unused here)
 * @param f        output formatter the result is written into
 * @param errss    error stream (unused here)
 * @param out      raw output buffer (unused here)
 * @return always 0; an unregistered command name aborts the process
 */
int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    // the dump helpers below expect client_lock to be held
    std::lock_guard l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions")
      m_client->dump_mds_sessions(f);
    else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
167
168
169 // -------------
170
// Construct a directory read cursor over @in with @perms.
// next_offset starts at 2 — readdir restarts elsewhere in this file also
// assume offset 2 as the first real entry (presumably 0/1 are reserved
// for "." and ".." — confirm against dir_result_t's declaration).
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
176
177 void Client::_reset_faked_inos()
178 {
179 ino_t start = 1024;
180 free_faked_inos.clear();
181 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
182 last_used_faked_ino = 0;
183 last_used_faked_root = 0;
184 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
185 }
186
/*
 * Allocate the next free fake inode number for @in from free_faked_inos.
 * Values 1024..2047 are reserved for _assign_faked_root(), so the scan
 * starts above 2048 and wraps back to 2048 once the high end is exhausted.
 */
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // wrapped around: restart just above the reserved root range
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // cursor fell into a gap; jump to the start of the next free interval
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // still inside the current free interval; take the next value
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
208
209 /*
210 * In the faked mode, if you export multiple subdirectories,
211 * you will see that the inode numbers of the exported subdirectories
212 * are the same. so we distinguish the mount point by reserving
213 * the "fake ids" between "1024~2048" and combining the last
214 * 10bits(0x3ff) of the "root inodes".
215 */
216 void Client::_assign_faked_root(Inode *in)
217 {
218 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
219 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
220 last_used_faked_root = 0;
221 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
222 }
223 assert(it != free_faked_inos.end());
224 vinodeno_t inode_info = in->vino();
225 uint64_t inode_num = (uint64_t)inode_info.ino;
226 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
227 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
228 assert(it.get_start() + it.get_len() > last_used_faked_root);
229
230 in->faked_ino = last_used_faked_root;
231 free_faked_inos.erase(in->faked_ino);
232 faked_ino_map[in->faked_ino] = in->vino();
233 }
234
235 void Client::_release_faked_ino(Inode *in)
236 {
237 free_faked_inos.insert(in->faked_ino);
238 faked_ino_map.erase(in->faked_ino);
239 }
240
241 vinodeno_t Client::_map_faked_ino(ino_t ino)
242 {
243 vinodeno_t vino;
244 if (ino == 1)
245 vino = root->vino();
246 else if (faked_ino_map.count(ino))
247 vino = faked_ino_map[ino];
248 else
249 vino = vinodeno_t(0, CEPH_NOSNAP);
250 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
251 return vino;
252 }
253
// Thread-safe wrapper for _map_faked_ino(): takes client_lock, then maps
// the fake ino to its real vinodeno_t.
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::lock_guard lock(client_lock);
  return _map_faked_ino(ino);
}
259
260 // cons/des
261
/*
 * Construct a client bound to the given messenger, monitor client and
 * objecter.  Sets up the fake-ino allocator, permission/ACL settings,
 * the dentry LRU, the fd allocator, and the object cacher / filer that
 * back file I/O.  Does not start any threads except objecter_finisher;
 * see init() for bring-up.
 */
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds 0-9 are never handed out
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback, // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
310
311
// Destructor: must not be entered with client_lock held, but takes it
// before tearing down the cache (see comment below for why).
Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::lock_guard l{client_lock};
  tear_down_cache();
}
322
323 void Client::tear_down_cache()
324 {
325 // fd's
326 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
327 it != fd_map.end();
328 ++it) {
329 Fh *fh = it->second;
330 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
331 _release_fh(fh);
332 }
333 fd_map.clear();
334
335 while (!opened_dirs.empty()) {
336 dir_result_t *dirp = *opened_dirs.begin();
337 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
338 _closedir(dirp);
339 }
340
341 // caps!
342 // *** FIXME ***
343
344 // empty lru
345 trim_cache();
346 ceph_assert(lru.lru_get_size() == 0);
347
348 // close root ino
349 ceph_assert(inode_map.size() <= 1 + root_parents.size());
350 if (root && inode_map.size() == 1 + root_parents.size()) {
351 delete root;
352 root = 0;
353 root_ancestor = 0;
354 while (!root_parents.empty())
355 root_parents.erase(root_parents.begin());
356 inode_map.clear();
357 _reset_faked_inos();
358 }
359
360 ceph_assert(inode_map.empty());
361 }
362
363 inodeno_t Client::get_root_ino()
364 {
365 std::lock_guard l(client_lock);
366 if (use_faked_inos())
367 return root->faked_ino;
368 else
369 return root->ino;
370 }
371
// Return the root inode with an extra ll reference taken on behalf of
// the caller (caller is responsible for releasing it).
Inode *Client::get_root()
{
  std::lock_guard l(client_lock);
  root->ll_get();
  return root;
}
378
379
380 // debug crapola
381
382 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
383 {
384 filepath path;
385 in->make_long_path(path);
386 ldout(cct, 1) << "dump_inode: "
387 << (disconnected ? "DISCONNECTED ":"")
388 << "inode " << in->ino
389 << " " << path
390 << " ref " << in->get_num_ref()
391 << *in << dendl;
392
393 if (f) {
394 f->open_object_section("inode");
395 f->dump_stream("path") << path;
396 if (disconnected)
397 f->dump_int("disconnected", 1);
398 in->dump(f);
399 f->close_section();
400 }
401
402 did.insert(in);
403 if (in->dir) {
404 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
405 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
406 it != in->dir->dentries.end();
407 ++it) {
408 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
409 if (f) {
410 f->open_object_section("dentry");
411 it->second->dump(f);
412 f->close_section();
413 }
414 if (it->second->inode)
415 dump_inode(f, it->second->inode.get(), did, false);
416 }
417 }
418 }
419
420 void Client::dump_cache(Formatter *f)
421 {
422 set<Inode*> did;
423
424 ldout(cct, 1) << __func__ << dendl;
425
426 if (f)
427 f->open_array_section("cache");
428
429 if (root)
430 dump_inode(f, root, did, true);
431
432 // make a second pass to catch anything disconnected
433 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
434 it != inode_map.end();
435 ++it) {
436 if (did.count(it->second))
437 continue;
438 dump_inode(f, it->second, did, true);
439 }
440
441 if (f)
442 f->close_section();
443 }
444
/*
 * Dump overall client status (identity, cache counters, map epochs,
 * blacklist state) into @f.  Caller must hold client_lock.
 */
void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    // session metadata key/value pairs sent to the MDS
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}
475
/*
 * Bring the client online: start the timer and object-cacher threads,
 * register this object as a messenger dispatcher, then finish setup
 * (perf counters, admin-socket commands) in _finish_init().
 * @return always 0.
 */
int Client::init()
{
  timer.init();
  objectcacher->start();
  {
    std::lock_guard l{client_lock};
    ceph_assert(!initialized);
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  return 0;
}
488
489 void Client::_finish_init()
490 {
491 {
492 std::lock_guard l{client_lock};
493 // logger
494 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
495 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
496 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
497 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
498 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
499 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
500 logger.reset(plb.create_perf_counters());
501 cct->get_perfcounters_collection()->add(logger.get());
502 }
503
504 cct->_conf.add_observer(this);
505
506 AdminSocket* admin_socket = cct->get_admin_socket();
507 int ret = admin_socket->register_command("mds_requests",
508 &m_command_hook,
509 "show in-progress mds requests");
510 if (ret < 0) {
511 lderr(cct) << "error registering admin socket command: "
512 << cpp_strerror(-ret) << dendl;
513 }
514 ret = admin_socket->register_command("mds_sessions",
515 &m_command_hook,
516 "show mds session state");
517 if (ret < 0) {
518 lderr(cct) << "error registering admin socket command: "
519 << cpp_strerror(-ret) << dendl;
520 }
521 ret = admin_socket->register_command("dump_cache",
522 &m_command_hook,
523 "show in-memory metadata cache contents");
524 if (ret < 0) {
525 lderr(cct) << "error registering admin socket command: "
526 << cpp_strerror(-ret) << dendl;
527 }
528 ret = admin_socket->register_command("kick_stale_sessions",
529 &m_command_hook,
530 "kick sessions that were remote reset");
531 if (ret < 0) {
532 lderr(cct) << "error registering admin socket command: "
533 << cpp_strerror(-ret) << dendl;
534 }
535 ret = admin_socket->register_command("status",
536 &m_command_hook,
537 "show overall client status");
538 if (ret < 0) {
539 lderr(cct) << "error registering admin socket command: "
540 << cpp_strerror(-ret) << dendl;
541 }
542
543 std::lock_guard l{client_lock};
544 initialized = true;
545 }
546
/*
 * Tear the client down in dependency order: close MDS sessions, detach
 * from config/admin-socket, drain the optional callback finishers, stop
 * the object cacher (outside client_lock — it joins its thread), then
 * the timer and objecter finisher, and finally unregister the perf
 * counters.
 */
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::lock_guard l{client_lock};
    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // each finisher below only runs if the corresponding callback was
  // ever registered; drain pending work before stopping
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.
  {
    std::lock_guard l{client_lock};
    ceph_assert(initialized);
    initialized = false;
    timer.shutdown();
  }
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
600
601
602 // ===================
603 // metadata cache stuff
604
/*
 * Shrink the dentry LRU down to client_cache_size (all the way to zero
 * when unmounting).  Optionally asks the kernel to drop its dcache too
 * if we could not get under the limit, and finally releases the root
 * inode if the cache is completely empty.
 */
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // loop until a full pass makes no progress (lru size unchanged)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  // still over the limit: some dentries are pinned by the kernel dcache
  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
638
639 void Client::trim_cache_for_reconnect(MetaSession *s)
640 {
641 mds_rank_t mds = s->mds_num;
642 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
643
644 int trimmed = 0;
645 list<Dentry*> skipped;
646 while (lru.lru_get_size() > 0) {
647 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
648 if (!dn)
649 break;
650
651 if ((dn->inode && dn->inode->caps.count(mds)) ||
652 dn->dir->parent_inode->caps.count(mds)) {
653 trim_dentry(dn);
654 trimmed++;
655 } else
656 skipped.push_back(dn);
657 }
658
659 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
660 lru.lru_insert_mid(*p);
661
662 ldout(cct, 20) << __func__ << " mds." << mds
663 << " trimmed " << trimmed << " dentries" << dendl;
664
665 if (s->caps.size() > 0)
666 _invalidate_kernel_dcache();
667 }
668
/*
 * Drop one dentry from the cache.  If it links an inode, the parent
 * directory can no longer be considered complete, so bump its release
 * count and clear I_COMPLETE/I_DIR_ORDERED before unlinking.
 */
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir "
		 << std::hex << dn->dir->parent_inode->ino << std::dec
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
682
683
/*
 * Apply MDS-reported size/truncate state to @in.
 *
 * Size is taken only if the truncate sequence advanced, or it is the
 * same sequence and the reported size grew.  A sequence bump also
 * invalidates cached file data beyond the truncate point and trims any
 * inline data.  truncate_size itself is applied for any
 * equal-or-newer sequence.
 */
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
725
726 void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
727 utime_t ctime, utime_t mtime, utime_t atime)
728 {
729 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
730 << " ctime " << ctime << " mtime " << mtime << dendl;
731
732 if (time_warp_seq > in->time_warp_seq)
733 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
734 << " is higher than local time_warp_seq "
735 << in->time_warp_seq << dendl;
736
737 int warn = false;
738 // be careful with size, mtime, atime
739 if (issued & (CEPH_CAP_FILE_EXCL|
740 CEPH_CAP_FILE_WR|
741 CEPH_CAP_FILE_BUFFER|
742 CEPH_CAP_AUTH_EXCL|
743 CEPH_CAP_XATTR_EXCL)) {
744 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
745 if (ctime > in->ctime)
746 in->ctime = ctime;
747 if (time_warp_seq > in->time_warp_seq) {
748 //the mds updated times, so take those!
749 in->mtime = mtime;
750 in->atime = atime;
751 in->time_warp_seq = time_warp_seq;
752 } else if (time_warp_seq == in->time_warp_seq) {
753 //take max times
754 if (mtime > in->mtime)
755 in->mtime = mtime;
756 if (atime > in->atime)
757 in->atime = atime;
758 } else if (issued & CEPH_CAP_FILE_EXCL) {
759 //ignore mds values as we have a higher seq
760 } else warn = true;
761 } else {
762 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
763 if (time_warp_seq >= in->time_warp_seq) {
764 in->ctime = ctime;
765 in->mtime = mtime;
766 in->atime = atime;
767 in->time_warp_seq = time_warp_seq;
768 } else warn = true;
769 }
770 if (warn) {
771 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
772 << time_warp_seq << " is lower than local time_warp_seq "
773 << in->time_warp_seq
774 << dendl;
775 }
776 }
777
778 void Client::_fragmap_remove_non_leaves(Inode *in)
779 {
780 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
781 if (!in->dirfragtree.is_leaf(p->first))
782 in->fragmap.erase(p++);
783 else
784 ++p;
785 }
786
787 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
788 {
789 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
790 if (p->second == mds)
791 in->fragmap.erase(p++);
792 else
793 ++p;
794 }
795
/*
 * Insert or refresh an inode in the cache from an MDS-supplied InodeStat.
 *
 * Creates the Inode (and possibly the root / a new mount-root ancestor)
 * on first sight, then selectively applies fields depending on which
 * caps the MDS reply grants versus which caps/dirty state we already
 * hold, and on whether the reported version is strictly newer.  Finally
 * registers the granted cap with @session (or accumulates snap caps for
 * snapshot inodes).
 *
 * @param st             decoded inode stat from the MDS reply
 * @param from           time the request was sent (lease/cap timing)
 * @param session        MDS session the reply arrived on
 * @param request_perms  credentials to associate with the new cap
 * @return the cached (new or updated) Inode
 */
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // very first inode we see becomes the root
      root = in;
      if (use_faked_inos())
	_assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // pre-mount lookups walk upward; remember the parent chain
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we currently hold (or have dirtied) shield the corresponding
  // locally-authoritative fields from being overwritten below
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;  // as with readdir returning indoes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inodes carry no real caps; just accumulate them
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
951
952
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * If a dentry with the right name already points at the right inode it
 * is just touched in the LRU; a dentry pointing at the wrong inode is
 * unlinked first.  When a new link has to be made, the parent dir (and
 * the old dentry's dir, if @old_dentry was moved from elsewhere) loses
 * its ordered/complete status.  Finishes by refreshing the dentry lease.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref so 'in' can't be destroyed while we re-link it
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1001
/*
 * Refresh @dn's MDS lease from a reply's LeaseStat: take the new lease
 * only if it is valid and extends the current TTL, and always refresh
 * the dentry's cached shared-gen from its parent directory.
 *
 * @param dn      dentry to update (must be non-null)
 * @param dlease  lease info from the MDS reply
 * @param from    time the request was sent (TTL is relative to this)
 * @param session session the lease was granted on
 */
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
		     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
1021
1022
/*
 * update MDS location cache for a single inode
 *
 * Records which mds is authoritative for the reported dirfrag (or
 * forgets it when auth is unknown), forces the local fragtree to treat
 * that frag as a leaf if it wasn't, and notes whether the frag is
 * replicated on other ranks.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!
}
1043
1044 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1045 {
1046 if (diri->flags & I_COMPLETE) {
1047 if (complete) {
1048 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1049 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1050 } else {
1051 if (diri->flags & I_DIR_ORDERED) {
1052 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1053 diri->flags &= ~I_DIR_ORDERED;
1054 }
1055 }
1056 if (diri->dir)
1057 diri->dir->readdir_cache.clear();
1058 }
1059 }
1060
1061 /*
1062 * insert results from readdir or lssnap into the metadata cache.
1063 */
// Decode the extra payload of a READDIR/LSSNAP reply and merge the
// returned dentries + inodes into the metadata cache and into the
// caller's dir_result_t buffer.  'diri' is the directory the listing
// belongs to (replaced by its snapdir for LSSNAP).
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  // With the new reply encoding, decode with all feature bits set;
  // otherwise honor the connection's negotiated feature set.
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real entry in a frag ('.' and '..' take 0/1)
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // The MDS may answer for a different (split/merged) frag than we
    // asked about; adopt it.  Without hash order, offsets are
    // per-frag, so restart them for the new frag.
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // If this reply starts at the very beginning of the directory,
    // snapshot the counters that let us later decide whether the
    // readdir cache we build below is still trustworthy.
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      // per-entry wire format: name, lease, inode stat
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      // Compute the entry's fpos: in hash order the high bits are the
      // name hash (restarting the low offset when the hash changes);
      // otherwise they are the frag id.
      if (hash_order) {
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      // Only if nothing changed the directory since we snapshotted the
      // counters above; otherwise the cache positions would be stale.
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // Remember where the next readdir should resume.
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1222
1223 /** insert_trace
1224 *
1225 * insert a trace from a MDS reply into the cache.
1226 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  // Decode the (diri, dentry, target-inode) trace from an MDS reply
  // and fold it into the local cache.  Returns the target inode, or
  // NULL for traceless / already-unsafe replies.
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // We already applied the unsafe reply's trace; the safe reply
    // carries no new information.
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // Traceless reply (e.g. MDS replayed the request after restart).
    // We can't trust our cached view of the affected dentry/dir, so
    // invalidate and, for destructive ops, unlink proactively.
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  // New reply encoding => decode with every feature bit assumed set.
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // dentry trace: parent dir stat, dirfrag stat, name, dentry lease
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    // Debug aid: verify the MDS included xattrs when we asked for them.
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // Negative dentry: drop any stale link, and cache the negative
      // entry only when the MDS granted a lease for it.
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1398
1399 // -------
1400
// Pick the MDS rank a request should be sent to.  Preference order:
// explicit resend target, hashed dirfrag authority, caps on the
// relevant inode, then a random up MDS.  On success *phash_diri is set
// to the directory whose fragmap supplied the answer (if any), so the
// caller can invalidate it if the target turns out to be stopped.
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  // A forward/resend directive overrides everything else.
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  // Derive an (inode, dentry-name hash) pair from whatever the request
  // references, to consult the directory's fragment map below.
  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // Snapped inodes have no auth of their own; walk up to the first
    // non-snap ancestor and route based on that.
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dentries.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    // If we know which frag the hashed name lands in, prefer that
    // frag's authoritative MDS (falling back to our auth cap holder).
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // Otherwise route via caps: auth cap when the op needs the auth
    // MDS, else any MDS we hold a cap from.
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1503
1504
1505 void Client::connect_mds_targets(mds_rank_t mds)
1506 {
1507 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1508 ceph_assert(mds_sessions.count(mds));
1509 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1510 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1511 q != info.export_targets.end();
1512 ++q) {
1513 if (mds_sessions.count(*q) == 0 &&
1514 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1515 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1516 << " export target mds." << *q << dendl;
1517 _open_mds_session(*q);
1518 }
1519 }
1520 }
1521
1522 void Client::dump_mds_sessions(Formatter *f)
1523 {
1524 f->dump_int("id", get_nodeid().v);
1525 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1526 f->dump_object("inst", inst);
1527 f->dump_stream("inst_str") << inst;
1528 f->dump_stream("addr_str") << inst.addr;
1529 f->open_array_section("sessions");
1530 for (const auto &p : mds_sessions) {
1531 f->open_object_section("session");
1532 p.second.dump(f);
1533 f->close_section();
1534 }
1535 f->close_section();
1536 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1537 }
1538 void Client::dump_mds_requests(Formatter *f)
1539 {
1540 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1541 p != mds_requests.end();
1542 ++p) {
1543 f->open_object_section("request");
1544 p->second->dump(f);
1545 f->close_section();
1546 }
1547 }
1548
// After a reply has been processed, make sure the caller gets a target
// inode even when the reply carried no trace (e.g. MDS replay): fall
// back to a lookup/getattr.  Also extracts the created-ino payload for
// create ops and reports it via *pcreated.
int Client::verify_reply_trace(int r, MetaSession *session,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // insert_trace already resolved the target; just hand it back.
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    // Traceless reply.  Try the created ino in our cache first.
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen? i want logs!");
	}
      } else {
	// No dentry: refresh the request's inode directly.
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	// A mismatch means someone raced our create (e.g. interrupted
	// and retried); surface it as EINTR.
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1633
1634
1635 /**
1636 * make a request
1637 *
1638 * Blocking helper to make an MDS request.
1639 *
1640 * If the ptarget flag is set, behavior changes slightly: the caller
1641 * expects to get a pointer to the inode we are creating or operating
1642 * on. As a result, we will follow up any traceless mutation reply
1643 * with a getattr or lookup to transparently handle a traceless reply
1644 * from the MDS (as when the MDS restarts and the client has to replay
1645 * a request).
1646 *
1647 * @param request the MetaRequest to execute
1648 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1649 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1650 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1651 * @param use_mds [optional] prefer a specific mds (-1 for default)
1652 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1653 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  // Register in the in-flight table (holds a ref).  SETFILELOCK ops
  // are excluded from oldest_tid tracking because they may block
  // indefinitely (see unregister_request for the matching skip).
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  MetaSession *session = NULL;
  // Retry loop: pick an MDS, make sure we have an open session to it,
  // send, then sleep until we get a reply, a forward, or a kick.
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// The rank no longer exists; purge the stale fragmap hint (or
	// fall back to a random rank) and retry immediately.
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	// Abort requests on REJECT from MDS
	if (rejected_by_mds.count(mds)) {
	  request->abort(-EPERM);
	  break;
	}
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // Adopt the already-held client_lock so the condvar can drop it
    // while we sleep; release() afterwards leaves it held for us.
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||	          // reply
	      request->resend_mds >= 0 || // forward
	      request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // Aborted without a reply: clean up our registration and bail.
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1801
1802 void Client::unregister_request(MetaRequest *req)
1803 {
1804 mds_requests.erase(req->tid);
1805 if (req->tid == oldest_tid) {
1806 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1807 while (true) {
1808 if (p == mds_requests.end()) {
1809 oldest_tid = 0;
1810 break;
1811 }
1812 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1813 oldest_tid = p->first;
1814 break;
1815 }
1816 ++p;
1817 }
1818 }
1819 put_request(req);
1820 }
1821
1822 void Client::put_request(MetaRequest *request)
1823 {
1824 if (request->_put()) {
1825 int op = -1;
1826 if (request->success)
1827 op = request->get_op();
1828 InodeRef other_in;
1829 request->take_other_inode(&other_in);
1830 delete request;
1831
1832 if (other_in &&
1833 (op == CEPH_MDS_OP_RMDIR ||
1834 op == CEPH_MDS_OP_RENAME ||
1835 op == CEPH_MDS_OP_RMSNAP)) {
1836 _try_to_trim_inode(other_in.get(), false);
1837 }
1838 }
1839 }
1840
// Append a cap release for 'in' to req->cap_releases if we hold a cap
// from 'mds' and can drop the requested bits ('drop') without giving
// up any 'unless' bits; 'force' appends a release record even when
// nothing is actually dropped.  Returns nonzero iff a release was
// appended.
int Client::encode_inode_release(Inode *in, MetaRequest *req,
			 mds_rank_t mds, int drop,
			 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
	   << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
	   << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // Never drop caps that are dirty or currently in use.
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
	!(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      if (&cap == in->auth_cap &&
	  !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
	in->requested_max_size = 0;
	ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      // Build the wire-format release record from the (possibly
      // updated) cap state.
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
	   << released << dendl;
  return released;
}
1886
// Release the parent directory's caps for this dentry and, if we also
// hold a lease on the dentry from this MDS, piggyback the dentry lease
// release onto the cap release record just appended.
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
			   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
	   << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
				    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    // Fill the dentry fields of the release appended by
    // encode_inode_release above.
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
	   << dn << ")" << dendl;
}
1906
1907
1908 /*
1909 * This requires the MClientRequest *request member to be set.
1910 * It will error out horribly without one.
1911 * Additionally, if you set any *drop member, you'd better have
1912 * set the corresponding dentry!
1913 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  // For each inode/dentry the request references, encode any cap or
  // lease releases the request asked for (the *_drop/*_unless fields).
  ldout(cct, 20) << __func__ << " enter (req: "
		 << req << ", mds: " << mds << ")" << dendl;
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
			 mds, req->inode_drop,
			 req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
			 mds, req->old_inode_drop,
			 req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
			 mds, req->other_inode_drop,
			 req->other_inode_unless);

  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
			  mds, req->dentry_drop,
			  req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
			  mds, req->old_dentry_drop,
			  req->old_dentry_unless);
  ldout(cct, 25) << __func__ << " exit (req: "
		 << req << ", mds " << mds <<dendl;
}
1944
1945 bool Client::have_open_session(mds_rank_t mds)
1946 {
1947 const auto &it = mds_sessions.find(mds);
1948 return it != mds_sessions.end() &&
1949 (it->second.state == MetaSession::STATE_OPEN ||
1950 it->second.state == MetaSession::STATE_STALE);
1951 }
1952
1953 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1954 {
1955 const auto &it = mds_sessions.find(mds);
1956 if (it == mds_sessions.end() || it->second.con != con) {
1957 return NULL;
1958 } else {
1959 return &it->second;
1960 }
1961 }
1962
1963 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1964 {
1965 auto it = mds_sessions.find(mds);
1966 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
1967 }
1968
1969 /**
1970 * Populate a map of strings with client-identifying metadata,
1971 * such as the hostname. Call this once at initialization.
1972 */
void Client::populate_metadata(const std::string &mount_root)
{
  // Hostname
  struct utsname u;
  int r = uname(&u);
  if (r >= 0) {
    metadata["hostname"] = u.nodename;
    ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
  } else {
    // NOTE(review): uname() returns -1 and sets errno on failure, so
    // cpp_strerror(r) decodes -1 rather than the real errno here —
    // the logged reason is likely wrong; consider cpp_strerror(errno).
    ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
  }

  metadata["pid"] = stringify(getpid());

  // Ceph entity id (the '0' in "client.0")
  metadata["entity_id"] = cct->_conf->name.get_id();

  // Our mount position
  if (!mount_root.empty()) {
    metadata["root"] = mount_root;
  }

  // Ceph version
  metadata["ceph_version"] = pretty_version_to_str();
  metadata["ceph_sha1"] = git_version_to_str();

  // Apply any metadata from the user's configured overrides
  std::vector<std::string> tokens;
  get_str_vec(cct->_conf->client_metadata, ",", tokens);
  for (const auto &i : tokens) {
    auto eqpos = i.find("=");
    // Throw out anything that isn't of the form "<str>=<str>"
    // NOTE(review): 'eqpos == i.size()' can never be true (find returns
    // an index < size or npos), so "key=" with an empty value is
    // accepted; presumably 'eqpos == i.size() - 1' was intended.
    if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
      lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
      continue;
    }
    metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
  }
}
2012
2013 /**
2014 * Optionally add or override client metadata fields.
2015 */
2016 void Client::update_metadata(std::string const &k, std::string const &v)
2017 {
2018 std::lock_guard l(client_lock);
2019 ceph_assert(initialized);
2020
2021 auto it = metadata.find(k);
2022 if (it != metadata.end()) {
2023 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2024 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2025 }
2026
2027 metadata[k] = v;
2028 }
2029
// Create a new session object for 'mds' and (unless this daemon
// instance previously rejected us) send a session-open request.
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  // Construct the MetaSession in place; the caller must not already
  // have a session for this rank.
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->addrs) {
      ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      // Same rank, different daemon instance: the old rejection no
      // longer applies.
      ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
2060
// Request an orderly close of a session; the session stays in CLOSING
// until the MDS confirms (see handle_client_session / _closed_mds_session).
void Client::_close_mds_session(MetaSession *s)
{
  ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSING;
  s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
}
2067
// Tear down a session that is now closed: mark down the connection,
// wake anyone waiting on it, drop its caps, resend its in-flight
// requests elsewhere, and finally erase it (invalidating 's').
void Client::_closed_mds_session(MetaSession *s)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
}
2079
/**
 * Dispatch an incoming MClientSession message (session-level protocol
 * traffic from an MDS): open/close acks, cap renewals, stale/recall
 * notifications, flush requests, force-readonly and rejection.
 */
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // refuse to talk to an MDS that lacks features we require
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
	lderr(cct) << "mds." << from << " lacks required features '"
		   << missing_features << "', closing session " << dendl;
	rejected_by_mds[session->mds_num] = session->addrs;
	_close_mds_session(session);
	_closed_mds_session(session);
	break;
      }
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (unmounting)
	mount_cond.notify_all();
      else
	connect_mds_targets(from);
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only honor the ack if it matches our most recent renew request
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
	wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
	error_str = it->second;
      else
	error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      // remember this instance so _open_mds_session won't retry it
      rejected_by_mds[session->mds_num] = session->addrs;
      _closed_mds_session(session);
    }
    break;

  default:
    ceph_abort();
  }
}
2173
2174 bool Client::_any_stale_sessions() const
2175 {
2176 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2177
2178 for (const auto &p : mds_sessions) {
2179 if (p.second.state == MetaSession::STATE_STALE) {
2180 return true;
2181 }
2182 }
2183
2184 return false;
2185 }
2186
/**
 * Close every session currently in STATE_STALE.
 */
void Client::_kick_stale_sessions()
{
  ldout(cct, 1) << __func__ << dendl;

  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    MetaSession &s = it->second;
    // advance before closing: _closed_mds_session erases this map entry,
    // which would invalidate 'it'
    ++it;
    if (s.state == MetaSession::STATE_STALE)
      _closed_mds_session(&s);
  }
}
2198
/**
 * Build and send the wire MClientRequest for a MetaRequest to the given
 * session, tracking it on the session's request list.
 *
 * @param request the in-memory request to (re)send
 * @param session destination MDS session
 * @param drop_cap_releases if true (used before cap reconnect), discard
 *        pending cap releases instead of piggybacking them on the request
 */
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // we already got an unsafe reply: this is a replay of a committed op
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may reference pool names; tell the MDS which osdmap we used
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // first send of this request: start the clock
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap migration seq we sent under, for ESTALE handling
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2247
/**
 * Construct the wire-format MClientRequest from a MetaRequest, filling
 * in the filepath(s) from the request's inode/dentry if they were not
 * set explicitly.  Increments the request's retry_attempt counter.
 */
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
{
  auto req = make_message<MClientRequest>(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// dentry with no inode yet (e.g. create): path = parent dir + name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // each rebuild counts as a (re)try
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2285
2286
2287
/**
 * Handle an MDS telling us a request was forwarded to another rank:
 * detach the request from its old session and wake the caller thread so
 * it resends to the new destination (resend_mds).
 */
void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
    return;
  }

  MetaRequest *request = mds_requests[tid];
  ceph_assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << __func__ << " tid " << tid
		 << " fwd " << fwd->get_num_fwd()
		 << " to mds." << fwd->get_dest_mds()
		 << ", resending to " << fwd->get_dest_mds()
		 << dendl;

  request->mds = -1;                        // no longer in flight on any mds
  request->item.remove_myself();            // detach from old session's list
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->notify_all();       // caller thread performs the resend
}
2322
2323 bool Client::is_dir_operation(MetaRequest *req)
2324 {
2325 int op = req->get_op();
2326 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2327 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2328 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2329 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2330 return true;
2331 return false;
2332 }
2333
2334 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2335 {
2336 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2337 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2338 if (!session) {
2339 return;
2340 }
2341
2342 ceph_tid_t tid = reply->get_tid();
2343 bool is_safe = reply->is_safe();
2344
2345 if (mds_requests.count(tid) == 0) {
2346 lderr(cct) << __func__ << " no pending request on tid " << tid
2347 << " safe is:" << is_safe << dendl;
2348 return;
2349 }
2350 MetaRequest *request = mds_requests.at(tid);
2351
2352 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2353 << " tid " << tid << dendl;
2354
2355 if (request->got_unsafe && !is_safe) {
2356 //duplicate response
2357 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2358 << mds_num << " safe:" << is_safe << dendl;
2359 return;
2360 }
2361
2362 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2363 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2364 << " from mds." << request->mds << dendl;
2365 request->send_to_auth = true;
2366 request->resend_mds = choose_target_mds(request);
2367 Inode *in = request->inode();
2368 std::map<mds_rank_t, Cap>::const_iterator it;
2369 if (request->resend_mds >= 0 &&
2370 request->resend_mds == request->mds &&
2371 (in == NULL ||
2372 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2373 request->sent_on_mseq == it->second.mseq)) {
2374 ldout(cct, 20) << "have to return ESTALE" << dendl;
2375 } else {
2376 request->caller_cond->notify_all();
2377 return;
2378 }
2379 }
2380
2381 ceph_assert(!request->reply);
2382 request->reply = reply;
2383 insert_trace(request, session);
2384
2385 // Handle unsafe reply
2386 if (!is_safe) {
2387 request->got_unsafe = true;
2388 session->unsafe_requests.push_back(&request->unsafe_item);
2389 if (is_dir_operation(request)) {
2390 Inode *dir = request->inode();
2391 ceph_assert(dir);
2392 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2393 }
2394 if (request->target) {
2395 InodeRef &in = request->target;
2396 in->unsafe_ops.push_back(&request->unsafe_target_item);
2397 }
2398 }
2399
2400 // Only signal the caller once (on the first reply):
2401 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2402 if (!is_safe || !request->got_unsafe) {
2403 ceph::condition_variable cond;
2404 request->dispatch_cond = &cond;
2405
2406 // wake up waiter
2407 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2408 request->caller_cond->notify_all();
2409
2410 // wake for kick back
2411 std::unique_lock l{client_lock, std::adopt_lock};
2412 cond.wait(l, [tid, request, &cond, this] {
2413 if (request->dispatch_cond) {
2414 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2415 << tid << " " << &cond << dendl;
2416 }
2417 return !request->dispatch_cond;
2418 });
2419 l.release();
2420 }
2421
2422 if (is_safe) {
2423 // the filesystem change is committed to disk
2424 // we're done, clean up
2425 if (request->got_unsafe) {
2426 request->unsafe_item.remove_myself();
2427 request->unsafe_dir_item.remove_myself();
2428 request->unsafe_target_item.remove_myself();
2429 signal_cond_list(request->waitfor_safe);
2430 }
2431 request->item.remove_myself();
2432 unregister_request(request);
2433 }
2434 if (unmounting)
2435 mount_cond.notify_all();
2436 }
2437
2438 void Client::_handle_full_flag(int64_t pool)
2439 {
2440 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2441 << "on " << pool << dendl;
2442 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2443 // to do this rather than blocking, because otherwise when we fill up we
2444 // potentially lock caps forever on files with dirty pages, and we need
2445 // to be able to release those caps to the MDS so that it can delete files
2446 // and free up space.
2447 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2448
2449 // For all inodes with layouts in this pool and a pending flush write op
2450 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2451 // from ObjectCacher so that it doesn't re-issue the write in response to
2452 // the ENOSPC error.
2453 // Fortunately since we're cancelling everything in a given pool, we don't
2454 // need to know which ops belong to which ObjectSet, we can just blow all
2455 // the un-flushed cached data away and mark any dirty inodes' async_err
2456 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2457 // affecting this pool, and all the objectsets we're purging were also
2458 // in this pool.
2459 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2460 i != inode_map.end(); ++i)
2461 {
2462 Inode *inode = i->second;
2463 if (inode->oset.dirty_or_tx
2464 && (pool == -1 || inode->layout.pool_id == pool)) {
2465 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2466 << " has dirty objects, purging and setting ENOSPC" << dendl;
2467 objectcacher->purge_set(&inode->oset);
2468 inode->set_async_err(-ENOSPC);
2469 }
2470 }
2471
2472 if (cancelled_epoch != (epoch_t)-1) {
2473 set_cap_epoch_barrier(cancelled_epoch);
2474 }
2475 }
2476
/**
 * Handle an incoming OSDMap update: detect whether this client has been
 * blacklisted (or un-blacklisted), and react to cluster/pool FULL flags
 * by cancelling affected writes.
 */
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blacklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < ceph_release_t::nautilus;
    });
  if (!blacklisted) {
    // check whether any of our addresses appears in the new blacklist
    // entries, accounting for the addr-type convention of the map epoch
    for (auto a : myaddrs.v) {
      // blacklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blacklists.count(a)) {
	new_blacklist = true;
	break;
      }
      if (prenautilus) {
	// ...except pre-nautilus, they were TYPE_LEGACY
	a.set_type(entity_addr_t::TYPE_LEGACY);
	if (new_blacklists.count(a)) {
	  new_blacklist = true;
	  break;
	}
      }
    }
  }
  if (new_blacklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
      });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;

    _abort_mds_sessions(-EBLACKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
	return o.is_blacklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // whole cluster full: cancel writes everywhere
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2560
2561
2562 // ------------------------
2563 // incoming messages
2564
2565
/**
 * Messenger dispatch entry point: route each incoming message to its
 * handler under client_lock.  Returns false for message types we do not
 * handle (so another dispatcher may claim them).
 */
bool Client::ms_dispatch2(const MessageRef &m)
{
  std::lock_guard l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(ref_cast<MMDSMap>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(ref_cast<MFSMap>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(ref_cast<MFSMapUser>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(ref_cast<MOSDMap>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(ref_cast<MClientRequestForward>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(ref_cast<MClientReply>(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(ref_cast<MClientSnap>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(ref_cast<MClientLease>(m));
    break;
  case MSG_COMMAND_REPLY:
    // only MDS command replies are ours; let others fall through
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(ref_cast<MCommandReply>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(ref_cast<MClientQuota>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // opportunistically trim the cache while unmounting; if it shrank,
  // poke unmount() so it can re-check for completion
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
		   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.notify_all();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2647
/**
 * Install a new FSMap, wake threads waiting on it, and acknowledge the
 * subscription epoch to the monitor.
 */
void Client::handle_fs_map(const MConstRef<MFSMap>& m)
{
  fsmap.reset(new FSMap(m->get_fsmap()));

  signal_cond_list(waiting_for_fsmap);

  monclient->sub_got("fsmap", fsmap->get_epoch());
}
2656
/**
 * Install a new user-visible FSMap (FSMapUser), acknowledge the
 * subscription epoch to the monitor, and wake threads waiting on it.
 */
void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
{
  fsmap_user.reset(new FSMapUser);
  *fsmap_user = m->get_fsmap();

  monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
  signal_cond_list(waiting_for_fsmap);
}
2665
/**
 * Handle a new MDSMap epoch: cancel commands to vanished/laggy MDS
 * daemons, then reconcile each existing session with the new map
 * (mark down dead instances, follow address changes, and drive state
 * transitions such as reconnect or post-reconnect cleanup).
 */
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  mds_gid_t old_inc, new_inc;
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    // stale or duplicate map; ignore
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;

  // keep the old map around so we can compare per-rank state
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // erase after the scan so we don't invalidate the iteration above
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = &p->second;
    // advance first: _closed_mds_session below erases the current entry
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      old_inc = oldmap->get_incarnation(mds);
      new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        // a genuinely new daemon instance: treat old state as unknown
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session);
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session);
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session);
          kick_flushing_caps(session);
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session, true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      // rank no longer exists in the map: close the session
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2767
/**
 * Send MClientReconnect to an MDS that entered reconnect state: reset
 * per-session cap/seq state, resend unsafe requests, and report every
 * cap (and snaprealm) we hold from that MDS so it can rebuild its cache.
 * If the MDS supports multi-message reconnect, the payload is split when
 * it grows too large.
 */
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // split into multiple messages if this one is getting huge and the
      // MDS supports it
      if (allow_multi &&
	  m->get_approx_size() >=
	  static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
	m->mark_more();
	session->con->send_message2(std::move(m));

	m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(cap.issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
	cap.gen = session->cap_gen;
	cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
	cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap.cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap.issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // describe each snaprealm only once per reconnect
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
2861
2862
2863 void Client::kick_requests(MetaSession *session)
2864 {
2865 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2866 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2867 p != mds_requests.end();
2868 ++p) {
2869 MetaRequest *req = p->second;
2870 if (req->got_unsafe)
2871 continue;
2872 if (req->aborted()) {
2873 if (req->caller_cond) {
2874 req->kick = true;
2875 req->caller_cond->notify_all();
2876 }
2877 continue;
2878 }
2879 if (req->retry_attempt > 0)
2880 continue; // new requests only
2881 if (req->mds == session->mds_num) {
2882 send_request(p->second, session);
2883 }
2884 }
2885 }
2886
2887 void Client::resend_unsafe_requests(MetaSession *session)
2888 {
2889 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2890 !iter.end();
2891 ++iter)
2892 send_request(*iter, session);
2893
2894 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2895 // process completed requests in clientreplay stage.
2896 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2897 p != mds_requests.end();
2898 ++p) {
2899 MetaRequest *req = p->second;
2900 if (req->got_unsafe)
2901 continue;
2902 if (req->aborted())
2903 continue;
2904 if (req->retry_attempt == 0)
2905 continue; // old requests only
2906 if (req->mds == session->mds_num)
2907 send_request(req, session, true);
2908 }
2909 }
2910
2911 void Client::wait_unsafe_requests()
2912 {
2913 list<MetaRequest*> last_unsafe_reqs;
2914 for (const auto &p : mds_sessions) {
2915 const MetaSession &s = p.second;
2916 if (!s.unsafe_requests.empty()) {
2917 MetaRequest *req = s.unsafe_requests.back();
2918 req->get();
2919 last_unsafe_reqs.push_back(req);
2920 }
2921 }
2922
2923 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2924 p != last_unsafe_reqs.end();
2925 ++p) {
2926 MetaRequest *req = *p;
2927 if (req->unsafe_item.is_on_list())
2928 wait_on_list(req->waitfor_safe);
2929 put_request(req);
2930 }
2931 }
2932
2933 void Client::kick_requests_closed(MetaSession *session)
2934 {
2935 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2936 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2937 p != mds_requests.end(); ) {
2938 MetaRequest *req = p->second;
2939 ++p;
2940 if (req->mds == session->mds_num) {
2941 if (req->caller_cond) {
2942 req->kick = true;
2943 req->caller_cond->notify_all();
2944 }
2945 req->item.remove_myself();
2946 if (req->got_unsafe) {
2947 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
2948 req->unsafe_item.remove_myself();
2949 if (is_dir_operation(req)) {
2950 Inode *dir = req->inode();
2951 assert(dir);
2952 dir->set_async_err(-EIO);
2953 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2954 << dir->ino << " " << req->get_tid() << dendl;
2955 req->unsafe_dir_item.remove_myself();
2956 }
2957 if (req->target) {
2958 InodeRef &in = req->target;
2959 in->set_async_err(-EIO);
2960 lderr(cct) << "kick_requests_closed drop req of inode : "
2961 << in->ino << " " << req->get_tid() << dendl;
2962 req->unsafe_target_item.remove_myself();
2963 }
2964 signal_cond_list(req->waitfor_safe);
2965 unregister_request(req);
2966 }
2967 }
2968 }
2969 ceph_assert(session->requests.empty());
2970 ceph_assert(session->unsafe_requests.empty());
2971 }
2972
2973
2974
2975
2976 /************
2977 * leases
2978 */
2979
/**
 * Account for an unsolicited (push) message from the MDS by bumping the
 * session sequence number.  If we are mid-close, re-send the close
 * request with the updated seq so the MDS accepts it.
 */
void Client::got_mds_push(MetaSession *s)
{
  s->seq++;
  ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
  if (s->state == MetaSession::STATE_CLOSING) {
    s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
  }
}
2988
/**
 * Handle a dentry-lease revoke from the MDS: invalidate the local lease
 * (if we still hold the dentry) and always reply with a LEASE_RELEASE ack.
 * Only CEPH_MDS_LEASE_REVOKE is expected here.
 */
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    // nothing to invalidate locally; still must ack the revoke
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LEASE_VALID) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // lease no longer valid
  }

 revoke:
  {
    // acknowledge the revoke regardless of local state
    auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
					    m->get_mask(), m->get_ino(),
					    m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3031
/**
 * Drop n references on an inode; when the count reaches zero, release
 * its caps and cached data, remove it from inode_map (and the faked-ino
 * table), clear root bookkeeping if it was the root, and delete it.
 */
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    // all cached data must already be clean at this point
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
3057
/**
 * Destroy an (empty) Dir object and drop the pins it held: the parent
 * dentry's pin (if any) and the parent inode reference.
 */
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
  ceph_assert(dir->is_empty());
  ceph_assert(in->dir == dir);
  ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
  if (!in->dentries.empty())
    in->get_first_parent()->put();   // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}
3072
3073 /**
3074 * Don't call this with in==NULL, use get_or_create for that
3075 * leave dn set to default NULL unless you're trying to add
3076 * a new inode to a pre-created Dentry
3077 */
/**
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * Links inode 'in' under 'name' in 'dir': creates a new Dentry (or fills
 * a pre-created one), enforcing the single-parent invariant for
 * directories by unlinking any previous parent dentry first.
 * Returns the dentry (new or reused).
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn);    // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // the old parent dir's contents changed; invalidate its completeness
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3113
// Detach dentry `dn` from its inode and (optionally) from its directory.
//   keepdir:    don't close the parent Dir even if it becomes empty
//   keepdentry: keep the (now-negative) dentry in the cache, only
//               invalidating its MDS lease
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // hold a ref so the inode outlives the dn->unlink() below
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // keep the dentry cached but drop its lease
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3143
3144 /**
3145 * For asynchronous flushes, check for errors from the IO and
3146 * update the inode if necessary
3147 */
3148 class C_Client_FlushComplete : public Context {
3149 private:
3150 Client *client;
3151 InodeRef inode;
3152 public:
3153 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3154 void finish(int r) override {
3155 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
3156 if (r != 0) {
3157 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3158 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3159 << " 0x" << std::hex << inode->ino << std::dec
3160 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3161 inode->set_async_err(r);
3162 }
3163 }
3164 };
3165
3166
3167 /****
3168 * caps
3169 */
3170
3171 void Client::get_cap_ref(Inode *in, int cap)
3172 {
3173 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3174 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3175 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3176 in->get();
3177 }
3178 if ((cap & CEPH_CAP_FILE_CACHE) &&
3179 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3180 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3181 in->get();
3182 }
3183 in->get_cap_ref(cap);
3184 }
3185
// Drop in-use references on `cap` bits. For every bit whose refcount
// reaches zero we may need to: finalize a pending cap_snap (last Fw),
// mark snap data clean and wake committers (last Fb), return revoked
// caps to the MDS, and unpin the inode refs taken in get_cap_ref().
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we just stopped using that the MDS has already revoked
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        // last writer is gone; the pending cap_snap can be finalized now
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        // all buffered data has been written out; snaps have no dirty data
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;  // matches the in->get() taken in get_cap_ref()
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;  // matches the in->get() taken in get_cap_ref()
    }
    if (drop)
      check_caps(in, 0);  // give the revoked caps back to the MDS
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3219
// Acquire cap references for an I/O operation: block until all of
// `need` is issued (plus whatever of `want` is issued and not mid-
// revocation), store the combined set in *phave and take a cap ref.
// For writes, `endoff` is the intended end offset and drives the
// max_size negotiation with the MDS.
// Returns 0 on success, -EBADF if no open mode covers `need`,
// -EROFS for writes via a read-only session, or a pool-perm error.
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      // no open file handle wants these caps -- bad file descriptor
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
        // grow wanted_max_size when the write nears or passes max_size
        if ((endoff >= (loff_t)in->max_size ||
             endoff > (loff_t)(in->size << 1)) &&
            endoff > (loff_t)in->wanted_max_size) {
          ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
          in->wanted_max_size = endoff;
        }
        if (in->wanted_max_size > in->max_size &&
            in->wanted_max_size > in->requested_max_size)
          check_caps(in, 0);  // ask the MDS for a larger max_size
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        // cannot write past max_size until the MDS raises it
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        // dirty snapshot data must be flushed before new writes proceed
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                       << " need " << ccap_string(need) << " want " << ccap_string(want)
                       << " revoking " << ccap_string(revoking)
                       << dendl;
        // only take the `want` bits that are not being revoked
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // our caps were dropped (e.g. stale session); make sure the MDS
      // knows what we need before blocking, possibly re-requesting them
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3315
3316 int Client::get_caps_used(Inode *in)
3317 {
3318 unsigned used = in->caps_used();
3319 if (!(used & CEPH_CAP_FILE_CACHE) &&
3320 !objectcacher->set_is_empty(&in->oset))
3321 used |= CEPH_CAP_FILE_CACHE;
3322 return used;
3323 }
3324
3325 void Client::cap_delay_requeue(Inode *in)
3326 {
3327 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3328 in->hold_caps_until = ceph_clock_now();
3329 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3330 delayed_list.push_back(&in->delay_cap_item);
3331 }
3332
// Send a CEPH_CAP_OP_UPDATE for `cap` to its MDS: acknowledge revoked
// caps (trimming issued/implemented down to `retain`), report current
// inode metadata, and carry any `flush` of dirty caps (tagged with
// flush_tid). `used`/`want` tell the MDS what we are using and wish
// to keep.
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain what is being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    // test-only fault injection: pretend we failed to release caps
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour: actually drop what we said we would
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  // current inode metadata for the MDS to record
  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // tell the MDS a cap_snap flush is still pending for this inode
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth MDS handles max_size requests
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3450
3451 static bool is_max_size_approaching(Inode *in)
3452 {
3453 /* mds will adjust max size according to the reported size */
3454 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3455 return false;
3456 if (in->size >= in->max_size)
3457 return true;
3458 /* half of previous max_size increment has been used */
3459 if (in->max_size > in->reported_size &&
3460 (in->size << 1) >= in->max_size + in->reported_size)
3461 return true;
3462 return false;
3463 }
3464
3465 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3466 {
3467 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3468 return used;
3469 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3470 return used;
3471
3472 if (issued & CEPH_CAP_FILE_LAZYIO) {
3473 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3474 used &= ~CEPH_CAP_FILE_CACHE;
3475 used |= CEPH_CAP_FILE_LAZYIO;
3476 }
3477 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3478 used &= ~CEPH_CAP_FILE_BUFFER;
3479 used |= CEPH_CAP_FILE_LAZYIO;
3480 }
3481 } else {
3482 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3483 used &= ~CEPH_CAP_FILE_CACHE;
3484 used |= CEPH_CAP_FILE_LAZYIO;
3485 }
3486 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3487 used &= ~CEPH_CAP_FILE_BUFFER;
3488 used |= CEPH_CAP_FILE_LAZYIO;
3489 }
3490 }
3491 return used;
3492 }
3493
3494 /**
3495 * check_caps
3496 *
3497 * Examine currently used and wanted versus held caps. Release, flush or ack
3498 * revoked caps to the MDS as appropriate.
3499 *
3500 * @param in the inode to check
3501 * @param flags flags to apply to cap check
3502 */
3503 void Client::check_caps(Inode *in, unsigned flags)
3504 {
3505 unsigned wanted = in->caps_wanted();
3506 unsigned used = get_caps_used(in);
3507 unsigned cap_used;
3508
3509 int implemented;
3510 int issued = in->caps_issued(&implemented);
3511 int revoking = implemented & ~issued;
3512
3513 int orig_used = used;
3514 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3515
3516 int retain = wanted | used | CEPH_CAP_PIN;
3517 if (!unmounting && in->nlink > 0) {
3518 if (wanted) {
3519 retain |= CEPH_CAP_ANY;
3520 } else if (in->is_dir() &&
3521 (issued & CEPH_CAP_FILE_SHARED) &&
3522 (in->flags & I_COMPLETE)) {
3523 // we do this here because we don't want to drop to Fs (and then
3524 // drop the Fs if we do a create!) if that alone makes us send lookups
3525 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3526 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3527 retain |= wanted;
3528 } else {
3529 retain |= CEPH_CAP_ANY_SHARED;
3530 // keep RD only if we didn't have the file open RW,
3531 // because then the mds would revoke it anyway to
3532 // journal max_size=0.
3533 if (in->max_size == 0)
3534 retain |= CEPH_CAP_ANY_RD;
3535 }
3536 }
3537
3538 ldout(cct, 10) << __func__ << " on " << *in
3539 << " wanted " << ccap_string(wanted)
3540 << " used " << ccap_string(used)
3541 << " issued " << ccap_string(issued)
3542 << " revoking " << ccap_string(revoking)
3543 << " flags=" << flags
3544 << dendl;
3545
3546 if (in->snapid != CEPH_NOSNAP)
3547 return; //snap caps last forever, can't write
3548
3549 if (in->caps.empty())
3550 return; // guard if at end of func
3551
3552 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3553 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3554 if (_release(in))
3555 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3556 }
3557
3558
3559 for (auto &p : in->caps) {
3560 mds_rank_t mds = p.first;
3561 Cap &cap = p.second;
3562
3563 MetaSession *session = &mds_sessions.at(mds);
3564
3565 cap_used = used;
3566 if (in->auth_cap && &cap != in->auth_cap)
3567 cap_used &= ~in->auth_cap->issued;
3568
3569 revoking = cap.implemented & ~cap.issued;
3570
3571 ldout(cct, 10) << " cap mds." << mds
3572 << " issued " << ccap_string(cap.issued)
3573 << " implemented " << ccap_string(cap.implemented)
3574 << " revoking " << ccap_string(revoking) << dendl;
3575
3576 if (in->wanted_max_size > in->max_size &&
3577 in->wanted_max_size > in->requested_max_size &&
3578 &cap == in->auth_cap)
3579 goto ack;
3580
3581 /* approaching file_max? */
3582 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3583 &cap == in->auth_cap &&
3584 is_max_size_approaching(in)) {
3585 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3586 << ", reported " << in->reported_size << dendl;
3587 goto ack;
3588 }
3589
3590 /* completed revocation? */
3591 if (revoking && (revoking & cap_used) == 0) {
3592 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3593 goto ack;
3594 }
3595
3596 /* want more caps from mds? */
3597 if (wanted & ~(cap.wanted | cap.issued))
3598 goto ack;
3599
3600 if (!revoking && unmounting && (cap_used == 0))
3601 goto ack;
3602
3603 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
3604 !in->dirty_caps) // and we have no dirty caps
3605 continue;
3606
3607 if (!(flags & CHECK_CAPS_NODELAY)) {
3608 ldout(cct, 10) << "delaying cap release" << dendl;
3609 cap_delay_requeue(in);
3610 continue;
3611 }
3612
3613 ack:
3614 if (&cap == in->auth_cap) {
3615 if (in->flags & I_KICK_FLUSH) {
3616 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3617 << " to mds." << mds << dendl;
3618 kick_flushing_caps(in, session);
3619 }
3620 if (!in->cap_snaps.empty() &&
3621 in->cap_snaps.rbegin()->second.flush_tid == 0)
3622 flush_snaps(in);
3623 }
3624
3625 int flushing;
3626 ceph_tid_t flush_tid;
3627 if (in->auth_cap == &cap && in->dirty_caps) {
3628 flushing = mark_caps_flushing(in, &flush_tid);
3629 } else {
3630 flushing = 0;
3631 flush_tid = 0;
3632 }
3633
3634 int msg_flags = (flags & CHECK_CAPS_SYNCHRONOUS) ? MClientCaps::FLAG_SYNC : 0;
3635 send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
3636 flushing, flush_tid);
3637 }
3638 }
3639
3640
3641 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3642 {
3643 int used = get_caps_used(in);
3644 int dirty = in->caps_dirty();
3645 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3646
3647 if (in->cap_snaps.size() &&
3648 in->cap_snaps.rbegin()->second.writing) {
3649 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3650 return;
3651 } else if (in->caps_dirty() ||
3652 (used & CEPH_CAP_FILE_WR) ||
3653 (dirty & CEPH_CAP_ANY_WR)) {
3654 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3655 ceph_assert(capsnapem.second); /* element inserted */
3656 CapSnap &capsnap = capsnapem.first->second;
3657 capsnap.context = old_snapc;
3658 capsnap.issued = in->caps_issued();
3659 capsnap.dirty = in->caps_dirty();
3660
3661 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3662
3663 capsnap.uid = in->uid;
3664 capsnap.gid = in->gid;
3665 capsnap.mode = in->mode;
3666 capsnap.btime = in->btime;
3667 capsnap.xattrs = in->xattrs;
3668 capsnap.xattr_version = in->xattr_version;
3669 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3670 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3671
3672 if (used & CEPH_CAP_FILE_WR) {
3673 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3674 capsnap.writing = 1;
3675 } else {
3676 finish_cap_snap(in, capsnap, used);
3677 }
3678 } else {
3679 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3680 }
3681 }
3682
// Record the inode's final size/time attributes into `capsnap` and, if
// no buffered data remains outstanding, kick off the flush to the MDS.
// Called either directly from queue_cap_snap() or from put_cap_ref()
// once the last writer drops Fw.
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // buffered writes still pending; _flushed_cap_snap() will flush later
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3713
3714 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3715 {
3716 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
3717 in->cap_snaps.at(seq).dirty_data = 0;
3718 flush_snaps(in);
3719 }
3720
3721 void Client::send_flush_snap(Inode *in, MetaSession *session,
3722 snapid_t follows, CapSnap& capsnap)
3723 {
3724 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3725 in->ino, in->snaprealm->ino, 0,
3726 in->auth_cap->mseq, cap_epoch_barrier);
3727 m->caller_uid = capsnap.cap_dirtier_uid;
3728 m->caller_gid = capsnap.cap_dirtier_gid;
3729
3730 m->set_client_tid(capsnap.flush_tid);
3731 m->head.snap_follows = follows;
3732
3733 m->head.caps = capsnap.issued;
3734 m->head.dirty = capsnap.dirty;
3735
3736 m->head.uid = capsnap.uid;
3737 m->head.gid = capsnap.gid;
3738 m->head.mode = capsnap.mode;
3739 m->btime = capsnap.btime;
3740
3741 m->size = capsnap.size;
3742
3743 m->head.xattr_version = capsnap.xattr_version;
3744 encode(capsnap.xattrs, m->xattrbl);
3745
3746 m->ctime = capsnap.ctime;
3747 m->btime = capsnap.btime;
3748 m->mtime = capsnap.mtime;
3749 m->atime = capsnap.atime;
3750 m->time_warp_seq = capsnap.time_warp_seq;
3751 m->change_attr = capsnap.change_attr;
3752
3753 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3754 m->inline_version = in->inline_version;
3755 m->inline_data = in->inline_data;
3756 }
3757
3758 ceph_assert(!session->flushing_caps_tids.empty());
3759 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3760
3761 session->con->send_message2(std::move(m));
3762 }
3763
// Flush every not-yet-flushed CapSnap on `in` to the auth MDS, in
// ascending snap order, stopping at the first one that still has
// dirty data or an in-flight write.
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    // stop at the first snap not ready to flush; later ones must wait
    // so snaps reach the MDS in order
    if (capsnap.dirty_data || capsnap.writing)
      break;

    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
3798
// Block until signal_cond_list() is called on `ls`.
// client_lock must already be held: it is adopted into a unique_lock so
// the condvar can atomically drop it while sleeping, then release()d so
// ownership stays with the caller on return.
void Client::wait_on_list(list<ceph::condition_variable*>& ls)
{
  ceph::condition_variable cond;
  ls.push_back(&cond);
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l);
  l.release();
  ls.remove(&cond);
}
3808
3809 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
3810 {
3811 for (auto cond : ls) {
3812 cond->notify_all();
3813 }
3814 }
3815
// Block until signal_context_list() completes the context we queue on
// `ls`. Uses the same adopt/release trick as wait_on_list() so
// client_lock (held by the caller) is dropped only while sleeping.
// The predicate guards against spurious wakeups; the queued C_Cond
// deletes itself when completed, and `r` receives its (ignored) result.
void Client::wait_on_context_list(list<Context*>& ls)
{
  ceph::condition_variable cond;
  bool done = false;
  int r;
  ls.push_back(new C_Cond(cond, &done, &r));
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();
}
3826
3827 void Client::signal_context_list(list<Context*>& ls)
3828 {
3829 while (!ls.empty()) {
3830 ls.front()->complete(0);
3831 ls.pop_front();
3832 }
3833 }
3834
// Kick every inode holding a cap on session `s` after a session event.
// reconnect=true: state is being rebuilt with the MDS, so the max_size
// negotiation is reset (it will be redone). reconnect=false: the
// session was revalidated — presumably after going stale (TODO confirm);
// caps the MDS did not re-issue are downgraded to PIN and flagged
// I_CAP_DROPPED so they get re-requested.
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
        // mds did not re-issue stale cap.
        cap->issued = cap->implemented = CEPH_CAP_PIN;
        // make sure mds knows what we want.
        if (in.caps_file_wanted() & ~cap->wanted)
          in.flags |= I_CAP_DROPPED;
      }
    }
    // wake anyone blocked in get_caps() on this inode
    signal_cond_list(in.waitfor_caps);
  }
}
3854
3855
3856 // flush dirty data (from objectcache)
3857
3858 class C_Client_CacheInvalidate : public Context {
3859 private:
3860 Client *client;
3861 vinodeno_t ino;
3862 int64_t offset, length;
3863 public:
3864 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3865 client(c), offset(off), length(len) {
3866 if (client->use_faked_inos())
3867 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3868 else
3869 ino = in->vino();
3870 }
3871 void finish(int r) override {
3872 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3873 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
3874 client->_async_invalidate(ino, offset, length);
3875 }
3876 };
3877
3878 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3879 {
3880 if (unmounting)
3881 return;
3882 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
3883 ino_invalidate_cb(callback_handle, ino, off, len);
3884 }
3885
3886 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3887
3888 if (ino_invalidate_cb)
3889 // we queue the invalidate, which calls the callback and decrements the ref
3890 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3891 }
3892
3893 void Client::_invalidate_inode_cache(Inode *in)
3894 {
3895 ldout(cct, 10) << __func__ << " " << *in << dendl;
3896
3897 // invalidate our userspace inode cache
3898 if (cct->_conf->client_oc) {
3899 objectcacher->release_set(&in->oset);
3900 if (!objectcacher->set_is_empty(&in->oset))
3901 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3902 }
3903
3904 _schedule_invalidate_callback(in, 0, 0);
3905 }
3906
3907 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3908 {
3909 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
3910
3911 // invalidate our userspace inode cache
3912 if (cct->_conf->client_oc) {
3913 vector<ObjectExtent> ls;
3914 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3915 objectcacher->discard_writeback(&in->oset, ls, nullptr);
3916 }
3917
3918 _schedule_invalidate_callback(in, off, len);
3919 }
3920
3921 bool Client::_release(Inode *in)
3922 {
3923 ldout(cct, 20) << "_release " << *in << dendl;
3924 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3925 _invalidate_inode_cache(in);
3926 return true;
3927 }
3928 return false;
3929 }
3930
3931 bool Client::_flush(Inode *in, Context *onfinish)
3932 {
3933 ldout(cct, 10) << "_flush " << *in << dendl;
3934
3935 if (!in->oset.dirty_or_tx) {
3936 ldout(cct, 10) << " nothing to flush" << dendl;
3937 onfinish->complete(0);
3938 return true;
3939 }
3940
3941 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3942 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3943 objectcacher->purge_set(&in->oset);
3944 if (onfinish) {
3945 onfinish->complete(-ENOSPC);
3946 }
3947 return true;
3948 }
3949
3950 return objectcacher->flush_set(&in->oset, onfinish);
3951 }
3952
// Synchronously flush dirty buffered data in [offset, offset+size)
// to the OSDs, blocking the caller until the writeback completes.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush; drop client_lock so writeback completion (which
    // needs the lock) can make progress while we block
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}
3971
3972 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3973 {
3974 // std::lock_guard l(client_lock);
3975 ceph_assert(ceph_mutex_is_locked(client_lock)); // will be called via dispatch() -> objecter -> ...
3976 Inode *in = static_cast<Inode *>(oset->parent);
3977 ceph_assert(in);
3978 _flushed(in);
3979 }
3980
3981 void Client::_flushed(Inode *in)
3982 {
3983 ldout(cct, 10) << "_flushed " << *in << dendl;
3984
3985 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
3986 }
3987
3988
3989
3990 // checks common to add_update_cap, handle_cap_grant
3991 void Client::check_cap_issue(Inode *in, unsigned issued)
3992 {
3993 unsigned had = in->caps_issued();
3994
3995 if ((issued & CEPH_CAP_FILE_CACHE) &&
3996 !(had & CEPH_CAP_FILE_CACHE))
3997 in->cache_gen++;
3998
3999 if ((issued & CEPH_CAP_FILE_SHARED) &&
4000 !(had & CEPH_CAP_FILE_SHARED)) {
4001 in->shared_gen++;
4002
4003 if (in->is_dir())
4004 clear_dir_complete_and_ordered(in, true);
4005 }
4006 }
4007
// Add a new cap for `in` on mds_session's MDS, or update the existing
// one: maintains snaprealm membership, handles cap import/export races,
// picks the auth cap, and wakes waiters if new cap bits were granted.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap on this inode: join its snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      // auth MDS says the inode belongs to a different realm; move it
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // updating an existing cap
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;  // cap went stale

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
        ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // switch auth cap if this MDS has a newer migration seq
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << __func__ << " changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // a newer migration seq replaces `wanted`; otherwise accumulate it
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // newly granted bits may unblock callers stuck in get_caps()
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4104
void Client::remove_cap(Cap *cap, bool queue_release)
{
  // Drop a single cap: optionally queue a release message to its MDS,
  // detach it from the inode's auth/flushing state, and erase it from the
  // inode's cap map (which destroys *cap -- the pointer dangles afterward).
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    // tell the MDS we no longer hold this cap
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    // losing the auth cap: flushes can no longer go through this session
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing the map entry destroys the Cap object itself
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;  // guard against accidental reuse of the dangling pointer

  if (!in.is_any_caps()) {
    // last cap is gone: drop the inode's snaprealm linkage and reference
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4140
4141 void Client::remove_all_caps(Inode *in)
4142 {
4143 while (!in->caps.empty())
4144 remove_cap(&in->caps.begin()->second, true);
4145 }
4146
void Client::remove_session_caps(MetaSession *s)
{
  // Drop every cap held through session s (e.g. when the session is torn
  // down).  Dirty/flushing state that can no longer be written back is
  // discarded, which is reported as an error.
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);  // keep the inode alive across remove_cap()
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      // any nonzero dirty/flushing bits collapse to 'true' here
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // data loss: these caps were dirty/flushing and cannot be flushed now
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in.get());  // drop the ref that was held for the flush
    }
    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}
4179
int Client::_do_remount(bool retry_on_error)
{
  // Ask the kernel (via remount_cb) to remount the filesystem, which has
  // the side effect of trimming unused dentries.  On repeated failure we
  // may abort the whole client, depending on configuration and
  // retry_on_error.  Returns the callback's return code.
  uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");

  errno = 0;
  int r = remount_cb(callback_handle);
  if (r == 0) {
    retries_on_invalidate = 0;  // success resets the failure budget
  } else {
    int e = errno;
    client_t whoami = get_nodeid();  // NOTE(review): appears unused here -- confirm before removing
    if (r == -1) {
      // callback follows the C convention: -1 with errno set
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "errno = " << e << " (" << strerror(e) << ")" << dendl;
    } else {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "return code = " << r << dendl;
    }
    // abort only when configured to die on failed invalidation and the
    // retry budget is exhausted (or no retry was requested at all)
    bool should_abort =
      (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
       cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
      !(retry_on_error && (++retries_on_invalidate < max_retries));
    if (should_abort && !unmounting) {
      lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
      ceph_abort();
    }
  }
  return r;
}
4211
4212 class C_Client_Remount : public Context {
4213 private:
4214 Client *client;
4215 public:
4216 explicit C_Client_Remount(Client *c) : client(c) {}
4217 void finish(int r) override {
4218 ceph_assert(r == 0);
4219 client->_do_remount(true);
4220 }
4221 };
4222
4223 void Client::_invalidate_kernel_dcache()
4224 {
4225 if (unmounting)
4226 return;
4227 if (can_invalidate_dentries) {
4228 if (dentry_invalidate_cb && root->dir) {
4229 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4230 p != root->dir->dentries.end();
4231 ++p) {
4232 if (p->second->inode)
4233 _schedule_invalidate_dentry_callback(p->second, false);
4234 }
4235 }
4236 } else if (remount_cb) {
4237 // Hacky:
4238 // when remounting a file system, linux kernel trims all unused dentries in the fs
4239 remount_finisher.queue(new C_Client_Remount(this));
4240 }
4241 }
4242
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  // If every dentry under this directory is negative (no inode linked),
  // unlink the expireable ones and close the dir once it empties out.
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() can erase the current entry
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false); // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  // recurse into the snapdir if one has been opened for this inode
  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4267
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  // Reduce the number of caps held via session s down toward 'max'.
  // Unused non-auth caps are dropped outright; otherwise we try to trim
  // the inode's dentries so the inode (and its cap) can be released.
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
		 << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);  // hold the inode across possible cap removal

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	cap = (remove_cap(cap, true), nullptr);  // remove, then null the dangling pointer
	trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;  // did every dentry of this inode turn out expireable?
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        ++q;  // advance before the dentry can be queued for trimming
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << "  not expirable: " << dn->name << dendl;
	  all = false;
        }
      }
      if (all && in->ino != MDS_INO_ROOT) {
        // every dentry was expireable, so this inode counts as trimmable
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  // still over budget? fall back to a kernel dcache invalidation
  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4335
4336 void Client::force_session_readonly(MetaSession *s)
4337 {
4338 s->readonly = true;
4339 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4340 auto &in = (*p)->inode;
4341 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4342 signal_cond_list(in.waitfor_caps);
4343 }
4344 }
4345
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  // Move the inode's dirty caps into flushing state under a fresh flush
  // tid.  Stores the tid in *ptid and returns the cap bits being flushed.
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    // first outstanding flush for this inode
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  // track the flush on the auth session
  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4373
4374 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4375 {
4376 for (auto &p : in->cap_snaps) {
4377 CapSnap &capsnap = p.second;
4378 if (capsnap.flush_tid > 0) {
4379 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4380 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4381 }
4382 }
4383 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4384 it != in->flushing_cap_tids.end();
4385 ++it) {
4386 old_s->flushing_caps_tids.erase(it->first);
4387 new_s->flushing_caps_tids.insert(it->first);
4388 }
4389 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4390 }
4391
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // first drain the delayed list ...
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;  // advance before pop_front removes the entry we're on
    delayed_list.pop_front();
    // only the very last check_caps across both lists is synchronous
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
4424
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  // Block until every cap flush on this inode with tid <= want has been
  // acked.  flushing_cap_tids is ordered, so inspecting the oldest entry
  // is sufficient.
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;  // everything up to 'want' has been flushed
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);  // woken via signal_cond_list on flush ack
  }
}
4438
void Client::wait_sync_caps(ceph_tid_t want)
{
  // Wait until every session has acked its cap flushes up to tid 'want'.
 retry:
  ldout(cct, 10) << __func__ << " want " << want  << " (last is " << last_flush_tid << ", "
	   << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
	continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      // sleep on sync_cond: adopt the already-held client_lock into a
      // guard for the wait, then release() so the guard does not unlock
      // it on scope exit (we still own the lock afterwards)
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      // session state may have changed while we slept; start over
      goto retry;
    }
  }
}
4459
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  // Re-send all outstanding cap flushes (and cap snap flushes) for this
  // inode to its auth MDS, e.g. after reconnect or auth-cap migration.
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // Entries in flushing_cap_tids with a zero cap bitmask are cap snap
  // flushes; find the newest such tid so ordinary flushes older than it
  // can be flagged as having a pending capsnap.
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // ordinary cap flush
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
	       p.second, p.first);
    } else {
      // cap snap flush; cap_snaps and the zero entries correspond in order
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4493
4494 void Client::kick_flushing_caps(MetaSession *session)
4495 {
4496 mds_rank_t mds = session->mds_num;
4497 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4498
4499 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4500 Inode *in = *p;
4501 if (in->flags & I_KICK_FLUSH) {
4502 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4503 kick_flushing_caps(in, session);
4504 }
4505 }
4506 }
4507
void Client::early_kick_flushing_caps(MetaSession *session)
{
  // Called during reconnect.  Flushes whose caps were revoked must be
  // re-sent immediately so the MDS processes them before issuing those
  // caps to another client; the rest are deferred via I_KICK_FLUSH.
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4535
4536 void SnapRealm::build_snap_context()
4537 {
4538 set<snapid_t> snaps;
4539 snapid_t max_seq = seq;
4540
4541 // start with prior_parents?
4542 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4543 snaps.insert(prior_parent_snaps[i]);
4544
4545 // current parent's snaps
4546 if (pparent) {
4547 const SnapContext& psnapc = pparent->get_snap_context();
4548 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4549 if (psnapc.snaps[i] >= parent_since)
4550 snaps.insert(psnapc.snaps[i]);
4551 if (psnapc.seq > max_seq)
4552 max_seq = psnapc.seq;
4553 }
4554
4555 // my snaps
4556 for (unsigned i=0; i<my_snaps.size(); i++)
4557 snaps.insert(my_snaps[i]);
4558
4559 // ok!
4560 cached_snap_context.seq = max_seq;
4561 cached_snap_context.snaps.resize(0);
4562 cached_snap_context.snaps.reserve(snaps.size());
4563 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4564 cached_snap_context.snaps.push_back(*p);
4565 }
4566
4567 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4568 {
4569 list<SnapRealm*> q;
4570 q.push_back(realm);
4571
4572 while (!q.empty()) {
4573 realm = q.front();
4574 q.pop_front();
4575
4576 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4577 realm->invalidate_cache();
4578
4579 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4580 p != realm->pchildren.end();
4581 ++p)
4582 q.push_back(*p);
4583 }
4584 }
4585
4586 SnapRealm *Client::get_snap_realm(inodeno_t r)
4587 {
4588 SnapRealm *realm = snap_realms[r];
4589 if (!realm)
4590 snap_realms[r] = realm = new SnapRealm(r);
4591 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4592 realm->nref++;
4593 return realm;
4594 }
4595
4596 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4597 {
4598 if (snap_realms.count(r) == 0) {
4599 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4600 return NULL;
4601 }
4602 SnapRealm *realm = snap_realms[r];
4603 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4604 realm->nref++;
4605 return realm;
4606 }
4607
void Client::put_snap_realm(SnapRealm *realm)
{
  // Drop a reference; on the last ref, unlink from the parent realm
  // (recursively dropping the parent's ref) and delete the realm.
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4621
bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
{
  // Re-parent 'realm' under the realm rooted at inode 'parent'.
  // Returns true iff the parent actually changed.
  if (realm->parent != parent) {
    ldout(cct, 10) << __func__ << " " << *realm
	     << " " << realm->parent << " -> " << parent << dendl;
    realm->parent = parent;
    // drop the link (and ref) to the old parent before taking the new one
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    realm->pparent = get_snap_realm(parent);  // takes a ref
    realm->pparent->pchildren.insert(realm);
    return true;
  }
  return false;
}
4638
4639 static bool has_new_snaps(const SnapContext& old_snapc,
4640 const SnapContext& new_snapc)
4641 {
4642 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4643 }
4644
4645
4646 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4647 {
4648 SnapRealm *first_realm = NULL;
4649 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4650
4651 map<SnapRealm*, SnapContext> dirty_realms;
4652
4653 auto p = bl.cbegin();
4654 while (!p.end()) {
4655 SnapRealmInfo info;
4656 decode(info, p);
4657 SnapRealm *realm = get_snap_realm(info.ino());
4658
4659 bool invalidate = false;
4660
4661 if (info.seq() > realm->seq) {
4662 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4663 << dendl;
4664
4665 if (flush) {
4666 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4667 // flush me + children
4668 list<SnapRealm*> q;
4669 q.push_back(realm);
4670 while (!q.empty()) {
4671 SnapRealm *realm = q.front();
4672 q.pop_front();
4673
4674 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4675 p != realm->pchildren.end();
4676 ++p)
4677 q.push_back(*p);
4678
4679 if (dirty_realms.count(realm) == 0) {
4680 realm->nref++;
4681 dirty_realms[realm] = realm->get_snap_context();
4682 }
4683 }
4684 }
4685
4686 // update
4687 realm->seq = info.seq();
4688 realm->created = info.created();
4689 realm->parent_since = info.parent_since();
4690 realm->prior_parent_snaps = info.prior_parent_snaps;
4691 realm->my_snaps = info.my_snaps;
4692 invalidate = true;
4693 }
4694
4695 // _always_ verify parent
4696 if (adjust_realm_parent(realm, info.parent()))
4697 invalidate = true;
4698
4699 if (invalidate) {
4700 invalidate_snaprealm_and_children(realm);
4701 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4702 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4703 } else {
4704 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4705 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4706 }
4707
4708 if (!first_realm)
4709 first_realm = realm;
4710 else
4711 put_snap_realm(realm);
4712 }
4713
4714 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4715 q != dirty_realms.end();
4716 ++q) {
4717 SnapRealm *realm = q->first;
4718 // if there are new snaps ?
4719 if (has_new_snaps(q->second, realm->get_snap_context())) {
4720 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4721 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4722 while (!r.end()) {
4723 Inode *in = *r;
4724 ++r;
4725 queue_cap_snap(in, q->second);
4726 }
4727 } else {
4728 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4729 }
4730 put_snap_realm(realm);
4731 }
4732
4733 if (realm_ret)
4734 *realm_ret = first_realm;
4735 else
4736 put_snap_realm(first_realm);
4737 }
4738
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  // Handle a snap notification from an MDS.  For SPLIT ops, inodes and
  // child realms listed in the message are moved under a new realm; the
  // embedded snap trace is then applied, and moved inodes get cap snaps
  // queued if their snap context gained snapshots.
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  // inodes to re-home into the new realm, with their old snap contexts
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	if (in->snaprealm->created > info.created()) {
	  // the inode already belongs to a newer realm; leave it alone
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
		   << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the snap trace; on DESTROY we skip flushing through old contexts
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // attach the moved inodes to the split-off realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4809
4810 void Client::handle_quota(const MConstRef<MClientQuota>& m)
4811 {
4812 mds_rank_t mds = mds_rank_t(m->get_source().num());
4813 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4814 if (!session) {
4815 return;
4816 }
4817
4818 got_mds_push(session);
4819
4820 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
4821
4822 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4823 if (inode_map.count(vino)) {
4824 Inode *in = NULL;
4825 in = inode_map[vino];
4826
4827 if (in) {
4828 in->quota = m->quota;
4829 in->rstat = m->rstat;
4830 }
4831 }
4832 }
4833
4834 void Client::handle_caps(const MConstRef<MClientCaps>& m)
4835 {
4836 mds_rank_t mds = mds_rank_t(m->get_source().num());
4837 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4838 if (!session) {
4839 return;
4840 }
4841
4842 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4843 // Pause RADOS operations until we see the required epoch
4844 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4845 }
4846
4847 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4848 // Record the barrier so that we will transmit it to MDS when releasing
4849 set_cap_epoch_barrier(m->osd_epoch_barrier);
4850 }
4851
4852 got_mds_push(session);
4853
4854 Inode *in;
4855 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4856 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4857 in = it->second;
4858 } else {
4859 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4860 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4861 session->enqueue_cap_release(
4862 m->get_ino(),
4863 m->get_cap_id(),
4864 m->get_seq(),
4865 m->get_mseq(),
4866 cap_epoch_barrier);
4867 } else {
4868 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
4869 }
4870
4871 // in case the mds is waiting on e.g. a revocation
4872 flush_cap_releases();
4873 return;
4874 }
4875
4876 switch (m->get_op()) {
4877 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4878 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4879 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
4880 }
4881
4882 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4883 Cap &cap = in->caps.at(mds);
4884
4885 switch (m->get_op()) {
4886 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4887 case CEPH_CAP_OP_IMPORT:
4888 case CEPH_CAP_OP_REVOKE:
4889 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4890 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4891 }
4892 } else {
4893 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4894 return;
4895 }
4896 }
4897
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  // Cap migration: an MDS is importing this inode's caps.  Install the
  // new (auth) cap, then drop the old peer cap it replaces.
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    // remember the exporting MDS' cap so we can remove it below
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
		 issued, wanted, m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  if (cap && cap->cap_id == m->peer.cap_id) {
    // drop the exporting MDS' cap, queueing a release if requested
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
	in->requested_max_size > m->get_max_size()) {
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
4940
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  // Cap migration: this MDS is exporting the inode's caps to a peer MDS.
  // Merge our cap state into the peer's cap (creating it if necessary),
  // then drop our cap on the exporting MDS.
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	// the message names the importing MDS; update or create its cap
	const auto peer_mds = mds_rank_t(m->peer.mds);
	MetaSession *tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);  // NOTE: shadows the outer 'it'
	if (it != in->caps.end()) {
	  Cap &tcap = it->second;
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    // fold our issued caps into the peer's existing cap
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession);
	  }
	} else {
	  // peer has no cap yet; create one carrying our issued caps
	  add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	// no peer: the caps are simply being dropped
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
4985
4986 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4987 {
4988 mds_rank_t mds = session->mds_num;
4989 ceph_assert(in->caps.count(mds));
4990
4991 ldout(cct, 10) << __func__ << " on ino " << *in
4992 << " size " << in->size << " -> " << m->get_size()
4993 << dendl;
4994
4995 int issued;
4996 in->caps_issued(&issued);
4997 issued |= in->caps_dirty();
4998 update_inode_file_size(in, issued, m->get_size(),
4999 m->get_truncate_seq(), m->get_truncate_size());
5000 }
5001
5002 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5003 {
5004 ceph_tid_t flush_ack_tid = m->get_client_tid();
5005 int dirty = m->get_dirty();
5006 int cleaned = 0;
5007 int flushed = 0;
5008
5009 auto it = in->flushing_cap_tids.begin();
5010 if (it->first < flush_ack_tid) {
5011 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5012 << " got unexpected flush ack tid " << flush_ack_tid
5013 << " expected is " << it->first << dendl;
5014 }
5015 for (; it != in->flushing_cap_tids.end(); ) {
5016 if (!it->second) {
5017 // cap snap
5018 ++it;
5019 continue;
5020 }
5021 if (it->first == flush_ack_tid)
5022 cleaned = it->second;
5023 if (it->first <= flush_ack_tid) {
5024 session->flushing_caps_tids.erase(it->first);
5025 in->flushing_cap_tids.erase(it++);
5026 ++flushed;
5027 continue;
5028 }
5029 cleaned &= ~it->second;
5030 if (!cleaned)
5031 break;
5032 ++it;
5033 }
5034
5035 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5036 << " cleaned " << ccap_string(cleaned) << " on " << *in
5037 << " with " << ccap_string(dirty) << dendl;
5038
5039 if (flushed) {
5040 signal_cond_list(in->waitfor_caps);
5041 if (session->flushing_caps_tids.empty() ||
5042 *session->flushing_caps_tids.begin() > flush_ack_tid)
5043 sync_cond.notify_all();
5044 }
5045
5046 if (!dirty) {
5047 in->cap_dirtier_uid = -1;
5048 in->cap_dirtier_gid = -1;
5049 }
5050
5051 if (!cleaned) {
5052 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5053 } else {
5054 if (in->flushing_caps) {
5055 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5056 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5057 in->flushing_caps &= ~cleaned;
5058 if (in->flushing_caps == 0) {
5059 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5060 num_flushing_caps--;
5061 if (in->flushing_cap_tids.empty())
5062 in->flushing_cap_item.remove_myself();
5063 }
5064 if (!in->caps_dirty())
5065 put_inode(in);
5066 }
5067 }
5068 }
5069
5070
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  // The MDS acked a FLUSHSNAP: retire the matching cap snap (if the tids
  // agree) and wake any waiters.
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      // ack for an older flush attempt of this snap; ignore it
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      InodeRef tmp_ref(in);  // keep the inode alive while we erase state
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5103
// Finisher context that delivers a dentry-invalidate notification to the
// registered callback outside of client_lock.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory (faked ino if use_faked_inos())
  vinodeno_t ino;     // target inode; zero ino means "invalidate only"
  string name;
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    // capture everything we need now; the Dentry may be gone by the time
    // finish() runs on the finisher thread
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();  // no deletion: leave the target ino zeroed
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5131
// Deliver one dentry invalidation to the registered callback.  Runs off
// the async_dentry_invalidator finisher; the caller asserts client_lock
// is NOT held (see C_Client_DentryInvalidate::finish).
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  // raced with unmount: don't call back into a tearing-down upper layer
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
                 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}
5140
5141 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5142 {
5143 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5144 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5145 }
5146
// Try to drop cached state that pins 'in' (child dentries, open dir
// handle, snapdir, and the dentries linking to it) so its reference
// count can reach zero.  When sched_inval is set, dentry invalidations
// are queued for the upper layer for the dentries we unlink.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();
  ldout(cct, 5) << __func__ << " in " << *in << dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
         p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() below can erase this entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
        _try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // the open dir held one reference
    }
  }

  // an open snapdir also holds a reference on the live inode
  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0) {
    // drop the dentries that link to this inode itself
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;
      if( in->ll_ref > 0 && sched_inval) {
        // FIXME: we play lots of unlink/link tricks when handling MDS replies,
        // so in->dentries doesn't always reflect the state of kernel's dcache.
        _schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5193
// Handle a cap GRANT/REVOKE message from an MDS: refresh the cap's
// seq/gen, fold the message's metadata into the inode (guarded by which
// caps we hold exclusively), then reconcile the locally issued caps
// against the new set — flushing/releasing cached data when caps that
// are in use get revoked.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const unsigned new_caps = m->get_caps();
  // cap is stale if the session's generation moved past the cap's
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
                << " mds." << mds << " seq " << m->get_seq()
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(cap->issued)
                << (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;  // forget stale grants
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // only accept MDS-provided fields we don't hold EXCL caps for;
  // otherwise our local (dirty) values are authoritative
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
        (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;  // last link gone; try to trim below
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
                           m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
                           m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
        !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      // cached data is now invalid; drop it, then respond via check_caps
      if (_release(in))
        check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
        if (&p.second == cap)
          continue;
        if (p.second.implemented & ~p.second.issued & new_caps) {
          check = true;
          break;
        }
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5349
5350 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5351 {
5352 if (perms.uid() == 0)
5353 return 0;
5354
5355 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5356 int ret = _posix_acl_permission(in, perms, want);
5357 if (ret != -EAGAIN)
5358 return ret;
5359 }
5360
5361 // check permissions before doing anything else
5362 if (!in->check_mode(perms, want))
5363 return -EACCES;
5364 return 0;
5365 }
5366
5367 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5368 const UserPerm& perms)
5369 {
5370 int r = _getattr_for_perm(in, perms);
5371 if (r < 0)
5372 goto out;
5373
5374 r = 0;
5375 if (strncmp(name, "system.", 7) == 0) {
5376 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5377 r = -EPERM;
5378 } else {
5379 r = inode_permission(in, perms, want);
5380 }
5381 out:
5382 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5383 return r;
5384 }
5385
5386 ostream& operator<<(ostream &out, const UserPerm& perm) {
5387 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5388 return out;
5389 }
5390
// Check whether 'perms' may apply the setattr described by (stx, mask)
// to 'in'.  Mirrors POSIX chown/chmod/utimes rules: root may do
// anything; the owner may change mode/times; chown/chgrp are further
// restricted; S_ISGID is silently stripped on chmod when the caller is
// not in the target group.  Returns 0 or a negative error code.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncate requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may actually change the owner; the owner may "chown"
    // to the current uid (a no-op)
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // non-root must own the file and target a group it belongs to
    // (or the file's current group)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
        (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // strip setgid when the caller isn't in the file's (target) group
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
              CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owner may only set times to "now", and even that requires
      // write permission (utimes(NULL) semantics)
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
        check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
        check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
        goto out;
      } else {
        r = inode_permission(in, perms, MAY_WRITE);
        if (r < 0)
          goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5447
5448 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5449 {
5450 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5451 unsigned want = 0;
5452
5453 if ((flags & O_ACCMODE) == O_WRONLY)
5454 want = MAY_WRITE;
5455 else if ((flags & O_ACCMODE) == O_RDWR)
5456 want = MAY_READ | MAY_WRITE;
5457 else if ((flags & O_ACCMODE) == O_RDONLY)
5458 want = MAY_READ;
5459 if (flags & O_TRUNC)
5460 want |= MAY_WRITE;
5461
5462 int r = 0;
5463 switch (in->mode & S_IFMT) {
5464 case S_IFLNK:
5465 r = -ELOOP;
5466 goto out;
5467 case S_IFDIR:
5468 if (want & MAY_WRITE) {
5469 r = -EISDIR;
5470 goto out;
5471 }
5472 break;
5473 }
5474
5475 r = _getattr_for_perm(in, perms);
5476 if (r < 0)
5477 goto out;
5478
5479 r = inode_permission(in, perms, want);
5480 out:
5481 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5482 return r;
5483 }
5484
5485 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5486 {
5487 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5488 int r = _getattr_for_perm(dir, perms);
5489 if (r < 0)
5490 goto out;
5491
5492 r = inode_permission(dir, perms, MAY_EXEC);
5493 out:
5494 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5495 return r;
5496 }
5497
5498 int Client::may_create(Inode *dir, const UserPerm& perms)
5499 {
5500 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5501 int r = _getattr_for_perm(dir, perms);
5502 if (r < 0)
5503 goto out;
5504
5505 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5506 out:
5507 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5508 return r;
5509 }
5510
5511 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5512 {
5513 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5514 int r = _getattr_for_perm(dir, perms);
5515 if (r < 0)
5516 goto out;
5517
5518 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5519 if (r < 0)
5520 goto out;
5521
5522 /* 'name == NULL' means rmsnap */
5523 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5524 InodeRef otherin;
5525 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5526 if (r < 0)
5527 goto out;
5528 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5529 r = -EPERM;
5530 }
5531 out:
5532 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5533 return r;
5534 }
5535
5536 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5537 {
5538 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5539 int r = _getattr_for_perm(in, perms);
5540 if (r < 0)
5541 goto out;
5542
5543 if (perms.uid() == 0 || perms.uid() == in->uid) {
5544 r = 0;
5545 goto out;
5546 }
5547
5548 r = -EPERM;
5549 if (!S_ISREG(in->mode))
5550 goto out;
5551
5552 if (in->mode & S_ISUID)
5553 goto out;
5554
5555 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5556 goto out;
5557
5558 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5559 out:
5560 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5561 return r;
5562 }
5563
5564 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5565 {
5566 int mask = CEPH_STAT_CAP_MODE;
5567 bool force = false;
5568 if (acl_type != NO_ACL) {
5569 mask |= CEPH_STAT_CAP_XATTR;
5570 force = in->xattr_version == 0;
5571 }
5572 return _getattr(in, mask, perms, force);
5573 }
5574
5575 vinodeno_t Client::_get_vino(Inode *in)
5576 {
5577 /* The caller must hold the client lock */
5578 return vinodeno_t(in->ino, in->snapid);
5579 }
5580
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 *
 * @return 0 on success, -ENOENT if nothing matched the spec.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  // 1) try the spec as a role (rank or filesystem:rank)
  mds_role_t role;
  std::stringstream ss;
  int role_r = fsmap->parse_role(mds_spec, &role, ss);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
                   << role << "'" << dendl;
    targets->push_back(
        fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
    return 0;
  }

  // 2) try the spec as a numeric GID
  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    } else {
      lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
                 << dendl;
      return -ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto mds_info = fsmap->get_mds_info();

    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
      return -ENOENT;
    }

    for (const auto i : mds_info) {
      targets->push_back(i.first);
    }
  } else {
    // 3) It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;

      lderr(cct) << "FSMap: " << *fsmap << dendl;

      return -ENOENT;
    } else {
      ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
                     << "' to GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    }
  }

  return 0;
}
5652
5653
/**
 * Authenticate with mon and establish global ID
 *
 * Called with client_lock held; the lock is dropped while the blocking
 * monclient authentication runs and re-taken afterwards.
 *
 * @return 0 on success, negative error code from the monclient otherwise.
 */
int Client::authenticate()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks; don't hold client_lock across it
  client_lock.unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.lock();
  if (r < 0) {
    return r;
  }

  // adopt the monitor-assigned global id as our entity name
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5677
// Learn the latest FSMap version from the monitors, then subscribe
// (one-shot) and wait until our local fsmap (or fsmap_user when 'user'
// is true) has caught up to at least that epoch.  Called with
// client_lock held; it is dropped while waiting on the monitor.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    client_lock.unlock();
    r = cond.wait();
    client_lock.lock();
  } while (r == -EAGAIN);  // retry while the monitor asks us to

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe until the (user-visible) fsmap.user reaches fsmap_latest
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full fsmap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
                 << fsmap_latest << dendl;
  return 0;
}
5721
/**
 * Send an administrative command to one or more MDS daemons.
 *
 * @mds_spec one of ID, rank, GID, "*"
 * @cmd      the command vector to send
 * @inbl     input payload
 * @outbl    where the reply payload is written (may be shared by targets)
 * @outs     where the reply status string is written
 * @onfinish completed once every targeted daemon has replied
 *
 * @return 0 if the commands were dispatched, negative error code otherwise.
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  std::lock_guard lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need an up-to-date FSMap to resolve specs and find daemon addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);  // onfinish fires after all replies
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // Generate MDSCommandOp state
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
                  << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    auto m = op.get_message(monclient->get_fsid());
    conn->send_message2(std::move(m));
  }
  gather.activate();

  return 0;
}
5808
5809 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
5810 {
5811 ceph_tid_t const tid = m->get_tid();
5812
5813 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5814
5815 if (!command_table.exists(tid)) {
5816 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5817 return;
5818 }
5819
5820 auto &op = command_table.get_command(tid);
5821 if (op.outbl) {
5822 *op.outbl = m->get_data();
5823 }
5824 if (op.outs) {
5825 *op.outs = m->rs;
5826 }
5827
5828 if (op.on_finish) {
5829 op.on_finish->complete(m->r);
5830 }
5831
5832 command_table.erase(tid);
5833 }
5834
5835 // -------------------
5836 // MOUNT
5837
// Authenticate and subscribe to the MDSMap for the requested filesystem.
// An empty fs_name falls back to the "client_fs" config option, then to
// the legacy "client_mds_namespace" option; if a name is resolved, the
// subscription is scoped to that filesystem's cluster id (mdsmap.<fscid>).
// Returns 0 on success or a negative error code.
int Client::subscribe_mdsmap(const std::string &fs_name)
{
  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string resolved_fs_name;
  if (fs_name.empty()) {
    resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
    if (resolved_fs_name.empty())
      // Try the backwards compatibility fs name option
      resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
  } else {
    resolved_fs_name = fs_name;
  }

  std::string want = "mdsmap";
  if (!resolved_fs_name.empty()) {
    // translate the fs name to a cluster id via the user fsmap
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fscid = fsmap_user->get_fs_cid(resolved_fs_name);
    if (fscid == FS_CLUSTER_ID_NONE) {
      return -ENOENT;
    }

    std::ostringstream oss;
    oss << want << "." << fscid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  return 0;
}
5877
// Mount the filesystem: subscribe to the MDSMap, optionally wait for an
// available MDS cluster, then getattr the mount point (walking up toward
// the root so quota ancestors get populated) and pin the root inode.
// Returns 0 on success (or if already mounted), negative error code or
// CEPH_FUSE_NO_MDS_UP otherwise.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
                  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
        // Error out
        ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
        return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
        // Continue to mount
        break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
        // Else, wait.  MDSMonitor will update the map to bring
        // us to a conclusion eventually.
        wait_on_list(waiting_for_mdsmap);
      } else {
        // Unexpected value!
        ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // getattr the mount point, then each ancestor up to the root; a
  // permission failure on an ancestor is tolerated once we have a root
  // (quotas may just not work in that case)
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
        ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
        break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);  // pin the root inode for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5971
5972 // UNMOUNT
5973
// Ask every MDS session to close, then wait on mount_cond until the
// session map has drained (entries are removed as peers acknowledge).
void Client::_close_sessions()
{
  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
        _close_mds_session(&p.second);
      }
    }

    // wait for sessions to close
    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
    // client_lock is already held: adopt it into a unique_lock for the
    // condition wait, then release() so scope exit doesn't unlock it
    std::unique_lock l{client_lock, std::adopt_lock};
    mount_cond.wait(l);
    l.release();
  }
}
5991
5992 void Client::flush_mdlog_sync()
5993 {
5994 if (mds_requests.empty())
5995 return;
5996 for (auto &p : mds_sessions) {
5997 flush_mdlog(&p.second);
5998 }
5999 }
6000
6001 void Client::flush_mdlog(MetaSession *session)
6002 {
6003 // Only send this to Luminous or newer MDS daemons, older daemons
6004 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6005 const uint64_t features = session->con->get_features();
6006 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6007 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6008 session->con->send_message2(std::move(m));
6009 }
6010 }
6011
6012
// Abort all outstanding MDS requests with 'err', wake their callers,
// and force-close every MDS session (used for abortive unmount /
// blacklisting).
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;  // advance first; close session below may erase entries
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session);
  }
}
6040
// Tear down the mount.  With abort (or when blacklisted) outstanding
// work is cancelled and dirty state dropped; otherwise everything is
// flushed to the MDS/OSDs first.  Order matters: drain requests, close
// open files/dirs, drop pins, flush or purge cached data and caps, wait
// for the cache to empty, then close the MDS sessions.
// Called with client_lock held (adopted into a unique_lock for waits).
void Client::_unmount(bool abort)
{
  std::unique_lock lock{client_lock, std::adopt_lock};
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // wait for in-flight MDS requests to drain
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
                     << dendl;
    }
    return mds_requests.empty();
  });
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ... and unclosed low-level (ll_) file handles
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // ... and unclosed directories
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  // wait for unsafe sync writes to be acknowledged
  mount_cond.wait(lock, [this] {
    if (unsafe_sync_write > 0) {
      ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
                    << dendl;
    }
    return unsafe_sync_write <= 0;
  });

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
        ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
        ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
        objectcacher->purge_set(&in->oset);  // drop, don't write back
      } else if (!in->caps.empty()) {
        _release(in);
        _flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // dirty caps can't reach the MDS anymore; drop them
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
        ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
        in->mark_caps_clean();
        put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // wait for the cache to fully empty; periodically dump it for
  // debugging if we appear stuck
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
                  << "+" << inode_map.size() << " items"
                  << ", waiting (for caps to release?)"
                  << dendl;
    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
        r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  // give ownership of client_lock back to the caller
  lock.release();
  ldout(cct, 2) << "unmounted." << dendl;
}
6178
6179 void Client::unmount()
6180 {
6181 std::lock_guard lock(client_lock);
6182 _unmount(false);
6183 }
6184
6185 void Client::abort_conn()
6186 {
6187 std::lock_guard lock(client_lock);
6188 _unmount(true);
6189 }
6190
6191 void Client::flush_cap_releases()
6192 {
6193 // send any cap releases
6194 for (auto &p : mds_sessions) {
6195 auto &session = p.second;
6196 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6197 p.first)) {
6198 if (cct->_conf->client_inject_release_failure) {
6199 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6200 } else {
6201 session.con->send_message2(std::move(session.release));
6202 }
6203 session.release.reset();
6204 }
6205 }
6206 }
6207
// Periodic housekeeping: re-arm the tick timer, time out mount-phase
// requests, renew caps / flush cap releases when we have an MDS map,
// and service inodes whose delayed cap checks have come due.
void Client::tick()
{
  // test hook: delay one tick, then clear the option so it fires once
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  // schedule the next tick
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new LambdaContext([this](int) {
        // Called back via Timer, which takes client_lock for us
        ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
        tick();
      }));
  utime_t now = ceph_clock_now();

  // abort the oldest pending request if mounting has timed out
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
        signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: list is ordered by hold_caps_until, so stop at the
  // first entry that is not yet due
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6263
6264 void Client::renew_caps()
6265 {
6266 ldout(cct, 10) << "renew_caps()" << dendl;
6267 last_cap_renew = ceph_clock_now();
6268
6269 for (auto &p : mds_sessions) {
6270 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6271 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6272 renew_caps(&p.second);
6273 }
6274 }
6275
6276 void Client::renew_caps(MetaSession *session)
6277 {
6278 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6279 session->last_cap_renew_request = ceph_clock_now();
6280 uint64_t seq = ++session->cap_renew_seq;
6281 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6282 }
6283
6284
6285 // ===============================================================
6286 // high level (POSIXy) interface
6287
6288 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6289 InodeRef *target, const UserPerm& perms)
6290 {
6291 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6292 MetaRequest *req = new MetaRequest(op);
6293 filepath path;
6294 dir->make_nosnap_relative_path(path);
6295 path.push_dentry(name);
6296 req->set_filepath(path);
6297 req->set_inode(dir);
6298 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6299 mask |= DEBUG_GETATTR_CAPS;
6300 req->head.args.getattr.mask = mask;
6301
6302 ldout(cct, 10) << __func__ << " on " << path << dendl;
6303
6304 int r = make_request(req, perms, target);
6305 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6306 return r;
6307 }
6308
/**
 * Look up component @dname in directory @dir, preferring the local
 * dentry cache (validated by dentry lease or by the directory's
 * FILE_SHARED cap) and falling back to an MDS request (_do_lookup).
 *
 * @param dir    parent directory inode
 * @param dname  name to resolve; "." and ".." are handled specially
 * @param mask   cap bits the caller needs issued on the result inode
 * @param target set to the resolved inode on success
 * @param perms  caller credentials
 * @return 0 on success; -ENOTDIR, -ENAMETOOLONG, -ENOENT, or the
 *         error from the MDS request on failure
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // no parent dentry linked locally; ask an arbitrary MDS for it
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// lookup failed; fall back to the directory itself
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // the configured snapshot directory name is synthesized locally
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    // only trust the cached dentry if its inode already carries the
    // caps the caller asked for (or it is a negative dentry)
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	// lease is only valid while the session's cap lease is fresh
	// and its generation matches the one the lease was issued in
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	// negative dentry plus a complete directory proves absence
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss or stale: go to the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    // valid negative dentry
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6425
6426 int Client::get_or_create(Inode *dir, const char* name,
6427 Dentry **pdn, bool expect_null)
6428 {
6429 // lookup
6430 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6431 dir->open_dir();
6432 if (dir->dir->dentries.count(name)) {
6433 Dentry *dn = dir->dir->dentries[name];
6434
6435 // is dn lease valid?
6436 utime_t now = ceph_clock_now();
6437 if (dn->inode &&
6438 dn->lease_mds >= 0 &&
6439 dn->lease_ttl > now &&
6440 mds_sessions.count(dn->lease_mds)) {
6441 MetaSession &s = mds_sessions.at(dn->lease_mds);
6442 if (s.cap_ttl > now &&
6443 s.cap_gen == dn->lease_gen) {
6444 if (expect_null)
6445 return -EEXIST;
6446 }
6447 }
6448 *pdn = dn;
6449 } else {
6450 // otherwise link up a new one
6451 *pdn = link(dir->dir, name, NULL, NULL);
6452 }
6453
6454 // success
6455 return 0;
6456 }
6457
/**
 * Resolve @origpath component-by-component, starting at root for
 * absolute paths or cwd otherwise, following symlinks as it goes.
 *
 * @param origpath  path to resolve
 * @param end       set to the final inode on success (may be NULL)
 * @param perms     caller credentials (used by may_lookup when
 *                  client_permissions is enabled)
 * @param followsym whether to follow a symlink in the final component;
 *                  non-final ("directory") symlinks are always followed
 * @param mask      extra cap bits requested on the final component
 * @return 0 on success, -ELOOP after MAXSYMLINKS expansions, -ENOENT
 *         or other negative errno from lookup/permission checks
 */
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym. always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// absolute target: restart the walk from root
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to. remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6539
6540
6541 // namespace ops
6542
6543 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6544 {
6545 std::lock_guard lock(client_lock);
6546 tout(cct) << "link" << std::endl;
6547 tout(cct) << relexisting << std::endl;
6548 tout(cct) << relpath << std::endl;
6549
6550 if (unmounting)
6551 return -ENOTCONN;
6552
6553 filepath existing(relexisting);
6554
6555 InodeRef in, dir;
6556 int r = path_walk(existing, &in, perm, true);
6557 if (r < 0)
6558 return r;
6559 if (std::string(relpath) == "/") {
6560 r = -EEXIST;
6561 return r;
6562 }
6563 filepath path(relpath);
6564 string name = path.last_dentry();
6565 path.pop_dentry();
6566
6567 r = path_walk(path, &dir, perm, true);
6568 if (r < 0)
6569 return r;
6570 if (cct->_conf->client_permissions) {
6571 if (S_ISDIR(in->mode)) {
6572 r = -EPERM;
6573 return r;
6574 }
6575 r = may_hardlink(in.get(), perm);
6576 if (r < 0)
6577 return r;
6578 r = may_create(dir.get(), perm);
6579 if (r < 0)
6580 return r;
6581 }
6582 r = _link(in.get(), dir.get(), name.c_str(), perm);
6583 return r;
6584 }
6585
6586 int Client::unlink(const char *relpath, const UserPerm& perm)
6587 {
6588 std::lock_guard lock(client_lock);
6589 tout(cct) << __func__ << std::endl;
6590 tout(cct) << relpath << std::endl;
6591
6592 if (unmounting)
6593 return -ENOTCONN;
6594
6595 if (std::string(relpath) == "/")
6596 return -EISDIR;
6597
6598 filepath path(relpath);
6599 string name = path.last_dentry();
6600 path.pop_dentry();
6601 InodeRef dir;
6602 int r = path_walk(path, &dir, perm);
6603 if (r < 0)
6604 return r;
6605 if (cct->_conf->client_permissions) {
6606 r = may_delete(dir.get(), name.c_str(), perm);
6607 if (r < 0)
6608 return r;
6609 }
6610 return _unlink(dir.get(), name.c_str(), perm);
6611 }
6612
6613 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6614 {
6615 std::lock_guard lock(client_lock);
6616 tout(cct) << __func__ << std::endl;
6617 tout(cct) << relfrom << std::endl;
6618 tout(cct) << relto << std::endl;
6619
6620 if (unmounting)
6621 return -ENOTCONN;
6622
6623 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6624 return -EBUSY;
6625
6626 filepath from(relfrom);
6627 filepath to(relto);
6628 string fromname = from.last_dentry();
6629 from.pop_dentry();
6630 string toname = to.last_dentry();
6631 to.pop_dentry();
6632
6633 InodeRef fromdir, todir;
6634 int r = path_walk(from, &fromdir, perm);
6635 if (r < 0)
6636 goto out;
6637 r = path_walk(to, &todir, perm);
6638 if (r < 0)
6639 goto out;
6640
6641 if (cct->_conf->client_permissions) {
6642 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6643 if (r < 0)
6644 return r;
6645 r = may_delete(todir.get(), toname.c_str(), perm);
6646 if (r < 0 && r != -ENOENT)
6647 return r;
6648 }
6649 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6650 out:
6651 return r;
6652 }
6653
6654 // dirs
6655
6656 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6657 {
6658 std::lock_guard lock(client_lock);
6659 tout(cct) << __func__ << std::endl;
6660 tout(cct) << relpath << std::endl;
6661 tout(cct) << mode << std::endl;
6662 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
6663
6664 if (unmounting)
6665 return -ENOTCONN;
6666
6667 if (std::string(relpath) == "/")
6668 return -EEXIST;
6669
6670 filepath path(relpath);
6671 string name = path.last_dentry();
6672 path.pop_dentry();
6673 InodeRef dir;
6674 int r = path_walk(path, &dir, perm);
6675 if (r < 0)
6676 return r;
6677 if (cct->_conf->client_permissions) {
6678 r = may_create(dir.get(), perm);
6679 if (r < 0)
6680 return r;
6681 }
6682 return _mkdir(dir.get(), name.c_str(), mode, perm);
6683 }
6684
/**
 * Create @relpath and any missing ancestors (like `mkdir -p`).
 *
 * Phase 1 walks as far down the existing path as possible; phase 2
 * creates the remaining components one level at a time.  An -EEXIST
 * on an intermediate component (e.g. raced creation) is resolved by
 * re-looking it up.
 *
 * NOTE(review): the walk starts at cwd rather than choosing root for
 * absolute paths the way path_walk() does — confirm filepath strips
 * the leading '/' / callers only pass mount-relative paths here.
 *
 * @return 0 on success, or the first negative errno encountered.
 */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // anything other than ENOENT (including full success, r == 0) means
  // there is nothing to create
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if(-EEXIST == r && i < path.depth() - 1) {
      // intermediate component appeared concurrently; look it up instead
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6739
6740 int Client::rmdir(const char *relpath, const UserPerm& perms)
6741 {
6742 std::lock_guard lock(client_lock);
6743 tout(cct) << __func__ << std::endl;
6744 tout(cct) << relpath << std::endl;
6745
6746 if (unmounting)
6747 return -ENOTCONN;
6748
6749 if (std::string(relpath) == "/")
6750 return -EBUSY;
6751
6752 filepath path(relpath);
6753 string name = path.last_dentry();
6754 path.pop_dentry();
6755 InodeRef dir;
6756 int r = path_walk(path, &dir, perms);
6757 if (r < 0)
6758 return r;
6759 if (cct->_conf->client_permissions) {
6760 int r = may_delete(dir.get(), name.c_str(), perms);
6761 if (r < 0)
6762 return r;
6763 }
6764 return _rmdir(dir.get(), name.c_str(), perms);
6765 }
6766
6767 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6768 {
6769 std::lock_guard lock(client_lock);
6770 tout(cct) << __func__ << std::endl;
6771 tout(cct) << relpath << std::endl;
6772 tout(cct) << mode << std::endl;
6773 tout(cct) << rdev << std::endl;
6774
6775 if (unmounting)
6776 return -ENOTCONN;
6777
6778 if (std::string(relpath) == "/")
6779 return -EEXIST;
6780
6781 filepath path(relpath);
6782 string name = path.last_dentry();
6783 path.pop_dentry();
6784 InodeRef dir;
6785 int r = path_walk(path, &dir, perms);
6786 if (r < 0)
6787 return r;
6788 if (cct->_conf->client_permissions) {
6789 int r = may_create(dir.get(), perms);
6790 if (r < 0)
6791 return r;
6792 }
6793 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6794 }
6795
6796 // symlinks
6797
6798 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6799 {
6800 std::lock_guard lock(client_lock);
6801 tout(cct) << __func__ << std::endl;
6802 tout(cct) << target << std::endl;
6803 tout(cct) << relpath << std::endl;
6804
6805 if (unmounting)
6806 return -ENOTCONN;
6807
6808 if (std::string(relpath) == "/")
6809 return -EEXIST;
6810
6811 filepath path(relpath);
6812 string name = path.last_dentry();
6813 path.pop_dentry();
6814 InodeRef dir;
6815 int r = path_walk(path, &dir, perms);
6816 if (r < 0)
6817 return r;
6818 if (cct->_conf->client_permissions) {
6819 int r = may_create(dir.get(), perms);
6820 if (r < 0)
6821 return r;
6822 }
6823 return _symlink(dir.get(), name.c_str(), target, perms);
6824 }
6825
6826 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6827 {
6828 std::lock_guard lock(client_lock);
6829 tout(cct) << __func__ << std::endl;
6830 tout(cct) << relpath << std::endl;
6831
6832 if (unmounting)
6833 return -ENOTCONN;
6834
6835 filepath path(relpath);
6836 InodeRef in;
6837 int r = path_walk(path, &in, perms, false);
6838 if (r < 0)
6839 return r;
6840
6841 return _readlink(in.get(), buf, size);
6842 }
6843
6844 int Client::_readlink(Inode *in, char *buf, size_t size)
6845 {
6846 if (!in->is_symlink())
6847 return -EINVAL;
6848
6849 // copy into buf (at most size bytes)
6850 int r = in->symlink.length();
6851 if (r > (int)size)
6852 r = size;
6853 memcpy(buf, in->symlink.c_str(), r);
6854 return r;
6855 }
6856
6857
6858 // inode stuff
6859
6860 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6861 {
6862 bool yes = in->caps_issued_mask(mask, true);
6863
6864 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
6865 if (yes && !force)
6866 return 0;
6867
6868 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6869 filepath path;
6870 in->make_nosnap_relative_path(path);
6871 req->set_filepath(path);
6872 req->set_inode(in);
6873 req->head.args.getattr.mask = mask;
6874
6875 int res = make_request(req, perms);
6876 ldout(cct, 10) << __func__ << " result=" << res << dendl;
6877 return res;
6878 }
6879
/**
 * Apply the attribute changes in @stx selected by @mask to @in.
 *
 * Changes covered by exclusive caps we hold (Ax for ownership/mode/
 * btime, Fx for atime/mtime) are applied locally and marked dirty;
 * whatever remains in @mask is sent to the MDS as a SETATTR request.
 *
 * @param in   inode to modify (must not be a snapshot: returns -EROFS)
 * @param stx  new attribute values
 * @param mask CEPH_SETATTR_* bits selecting which fields of @stx apply
 * @param inp  optional out: inode ref from the MDS reply
 * @return 0 on success, negative errno (-EROFS, -EDQUOT, -EFBIG, or
 *         the MDS request result) on failure
 */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must respect quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold so the ctime gets flushed;
    // with none held, fall through to a sync request
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  // with Ax we can apply ownership/mode/btime changes locally
  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only the permission bits change; file type bits are preserved
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  // with Fx we can apply timestamp changes locally
  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
	in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
	in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  // everything handled locally: no MDS round trip needed
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  // ask the MDS to drop the caps invalidated by each change
  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
7070
/* Note that we only care about attrs that setattr cares about */
// Convert a struct stat into the ceph_statx fields consumed by the
// setattr path (size, mode, uid/gid, mtime/atime).  Other statx
// fields are left untouched.
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  // macOS spells the timespec members st_*timespec
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
7086
7087 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7088 const UserPerm& perms, InodeRef *inp)
7089 {
7090 int ret = _do_setattr(in, stx, mask, perms, inp);
7091 if (ret < 0)
7092 return ret;
7093 if (mask & CEPH_SETATTR_MODE)
7094 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7095 return ret;
7096 }
7097
7098 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7099 const UserPerm& perms)
7100 {
7101 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7102 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7103 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7104 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7105 if (cct->_conf->client_permissions) {
7106 int r = may_setattr(in.get(), stx, mask, perms);
7107 if (r < 0)
7108 return r;
7109 }
7110 return __setattrx(in.get(), stx, mask, perms);
7111 }
7112
7113 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7114 const UserPerm& perms)
7115 {
7116 struct ceph_statx stx;
7117
7118 stat_to_statx(attr, &stx);
7119 mask &= ~CEPH_SETATTR_BTIME;
7120
7121 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7122 mask &= ~CEPH_SETATTR_UID;
7123 }
7124 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7125 mask &= ~CEPH_SETATTR_GID;
7126 }
7127
7128 return _setattrx(in, &stx, mask, perms);
7129 }
7130
7131 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7132 const UserPerm& perms)
7133 {
7134 std::lock_guard lock(client_lock);
7135 tout(cct) << __func__ << std::endl;
7136 tout(cct) << relpath << std::endl;
7137 tout(cct) << mask << std::endl;
7138
7139 if (unmounting)
7140 return -ENOTCONN;
7141
7142 filepath path(relpath);
7143 InodeRef in;
7144 int r = path_walk(path, &in, perms);
7145 if (r < 0)
7146 return r;
7147 return _setattr(in, attr, mask, perms);
7148 }
7149
7150 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7151 const UserPerm& perms, int flags)
7152 {
7153 std::lock_guard lock(client_lock);
7154 tout(cct) << __func__ << std::endl;
7155 tout(cct) << relpath << std::endl;
7156 tout(cct) << mask << std::endl;
7157
7158 if (unmounting)
7159 return -ENOTCONN;
7160
7161 filepath path(relpath);
7162 InodeRef in;
7163 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7164 if (r < 0)
7165 return r;
7166 return _setattrx(in, stx, mask, perms);
7167 }
7168
7169 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7170 {
7171 std::lock_guard lock(client_lock);
7172 tout(cct) << __func__ << std::endl;
7173 tout(cct) << fd << std::endl;
7174 tout(cct) << mask << std::endl;
7175
7176 if (unmounting)
7177 return -ENOTCONN;
7178
7179 Fh *f = get_filehandle(fd);
7180 if (!f)
7181 return -EBADF;
7182 #if defined(__linux__) && defined(O_PATH)
7183 if (f->flags & O_PATH)
7184 return -EBADF;
7185 #endif
7186 return _setattr(f->inode, attr, mask, perms);
7187 }
7188
7189 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7190 {
7191 std::lock_guard lock(client_lock);
7192 tout(cct) << __func__ << std::endl;
7193 tout(cct) << fd << std::endl;
7194 tout(cct) << mask << std::endl;
7195
7196 if (unmounting)
7197 return -ENOTCONN;
7198
7199 Fh *f = get_filehandle(fd);
7200 if (!f)
7201 return -EBADF;
7202 #if defined(__linux__) && defined(O_PATH)
7203 if (f->flags & O_PATH)
7204 return -EBADF;
7205 #endif
7206 return _setattrx(f->inode, stx, mask, perms);
7207 }
7208
7209 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7210 frag_info_t *dirstat, int mask)
7211 {
7212 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7213 std::lock_guard lock(client_lock);
7214 tout(cct) << "stat" << std::endl;
7215 tout(cct) << relpath << std::endl;
7216
7217 if (unmounting)
7218 return -ENOTCONN;
7219
7220 filepath path(relpath);
7221 InodeRef in;
7222 int r = path_walk(path, &in, perms, true, mask);
7223 if (r < 0)
7224 return r;
7225 r = _getattr(in, mask, perms);
7226 if (r < 0) {
7227 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7228 return r;
7229 }
7230 fill_stat(in, stbuf, dirstat);
7231 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7232 return r;
7233 }
7234
7235 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7236 {
7237 unsigned mask = 0;
7238
7239 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7240 if (flags & AT_NO_ATTR_SYNC)
7241 goto out;
7242
7243 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7244 mask |= CEPH_CAP_PIN;
7245 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7246 mask |= CEPH_CAP_AUTH_SHARED;
7247 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7248 mask |= CEPH_CAP_LINK_SHARED;
7249 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7250 mask |= CEPH_CAP_FILE_SHARED;
7251 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7252 mask |= CEPH_CAP_XATTR_SHARED;
7253 out:
7254 return mask;
7255 }
7256
7257 int Client::statx(const char *relpath, struct ceph_statx *stx,
7258 const UserPerm& perms,
7259 unsigned int want, unsigned int flags)
7260 {
7261 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7262 std::lock_guard lock(client_lock);
7263 tout(cct) << "statx" << std::endl;
7264 tout(cct) << relpath << std::endl;
7265
7266 if (unmounting)
7267 return -ENOTCONN;
7268
7269 filepath path(relpath);
7270 InodeRef in;
7271
7272 unsigned mask = statx_to_mask(flags, want);
7273
7274 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7275 if (r < 0)
7276 return r;
7277
7278 r = _getattr(in, mask, perms);
7279 if (r < 0) {
7280 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7281 return r;
7282 }
7283
7284 fill_statx(in, mask, stx);
7285 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7286 return r;
7287 }
7288
7289 int Client::lstat(const char *relpath, struct stat *stbuf,
7290 const UserPerm& perms, frag_info_t *dirstat, int mask)
7291 {
7292 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7293 std::lock_guard lock(client_lock);
7294 tout(cct) << __func__ << std::endl;
7295 tout(cct) << relpath << std::endl;
7296
7297 if (unmounting)
7298 return -ENOTCONN;
7299
7300 filepath path(relpath);
7301 InodeRef in;
7302 // don't follow symlinks
7303 int r = path_walk(path, &in, perms, false, mask);
7304 if (r < 0)
7305 return r;
7306 r = _getattr(in, mask, perms);
7307 if (r < 0) {
7308 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7309 return r;
7310 }
7311 fill_stat(in, stbuf, dirstat);
7312 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7313 return r;
7314 }
7315
/*
 * Populate a POSIX struct stat from the cached state of @in.
 *
 * @param in      inode to report on
 * @param st      output buffer; zeroed before filling
 * @param dirstat if non-NULL, receives the inode's dirstat fragment info
 * @param rstat   if non-NULL, receives the inode's recursive stats
 * @return the set of caps currently issued for the inode
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  st->st_dev = in->snapid;  // snapid doubles as the reported device
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // synthesize a POSIX-ish link count for directories
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();  // a dir can never have multiple hard links
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report the newer of ctime/mtime as ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // directory size: recursive bytes or number of entries, by config
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7377
/*
 * Populate a ceph_statx from cached inode state.  Only field groups whose
 * backing caps appear in @mask are filled; stx->stx_mask records which
 * CEPH_STATX_* fields are valid.  A zero @mask (AT_NO_ATTR_SYNC) is
 * treated as "report everything we have".
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      // synthesize a POSIX-ish link count for directories
      switch (in->nlink) {
      case 0:
	stx->stx_nlink = 0; /* dir is unlinked */
	break;
      case 1:
	stx->stx_nlink = 1 /* parent dentry */
			 + 1 /* <dir>/. */
			 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	break;
      default:
	ceph_abort();  // a dir can never have multiple hard links
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // directory size: recursive byte count or entry count, by config
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // report the newer of ctime/mtime as the change time
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7460
// Mark a dentry as recently used in the client LRU so it is evicted later.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7465
7466 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7467 {
7468 std::lock_guard lock(client_lock);
7469 tout(cct) << __func__ << std::endl;
7470 tout(cct) << relpath << std::endl;
7471 tout(cct) << mode << std::endl;
7472
7473 if (unmounting)
7474 return -ENOTCONN;
7475
7476 filepath path(relpath);
7477 InodeRef in;
7478 int r = path_walk(path, &in, perms);
7479 if (r < 0)
7480 return r;
7481 struct stat attr;
7482 attr.st_mode = mode;
7483 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7484 }
7485
7486 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7487 {
7488 std::lock_guard lock(client_lock);
7489 tout(cct) << __func__ << std::endl;
7490 tout(cct) << fd << std::endl;
7491 tout(cct) << mode << std::endl;
7492
7493 if (unmounting)
7494 return -ENOTCONN;
7495
7496 Fh *f = get_filehandle(fd);
7497 if (!f)
7498 return -EBADF;
7499 #if defined(__linux__) && defined(O_PATH)
7500 if (f->flags & O_PATH)
7501 return -EBADF;
7502 #endif
7503 struct stat attr;
7504 attr.st_mode = mode;
7505 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7506 }
7507
7508 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7509 {
7510 std::lock_guard lock(client_lock);
7511 tout(cct) << __func__ << std::endl;
7512 tout(cct) << relpath << std::endl;
7513 tout(cct) << mode << std::endl;
7514
7515 if (unmounting)
7516 return -ENOTCONN;
7517
7518 filepath path(relpath);
7519 InodeRef in;
7520 // don't follow symlinks
7521 int r = path_walk(path, &in, perms, false);
7522 if (r < 0)
7523 return r;
7524 struct stat attr;
7525 attr.st_mode = mode;
7526 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7527 }
7528
7529 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7530 const UserPerm& perms)
7531 {
7532 std::lock_guard lock(client_lock);
7533 tout(cct) << __func__ << std::endl;
7534 tout(cct) << relpath << std::endl;
7535 tout(cct) << new_uid << std::endl;
7536 tout(cct) << new_gid << std::endl;
7537
7538 if (unmounting)
7539 return -ENOTCONN;
7540
7541 filepath path(relpath);
7542 InodeRef in;
7543 int r = path_walk(path, &in, perms);
7544 if (r < 0)
7545 return r;
7546 struct stat attr;
7547 attr.st_uid = new_uid;
7548 attr.st_gid = new_gid;
7549 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7550 }
7551
7552 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7553 {
7554 std::lock_guard lock(client_lock);
7555 tout(cct) << __func__ << std::endl;
7556 tout(cct) << fd << std::endl;
7557 tout(cct) << new_uid << std::endl;
7558 tout(cct) << new_gid << std::endl;
7559
7560 if (unmounting)
7561 return -ENOTCONN;
7562
7563 Fh *f = get_filehandle(fd);
7564 if (!f)
7565 return -EBADF;
7566 #if defined(__linux__) && defined(O_PATH)
7567 if (f->flags & O_PATH)
7568 return -EBADF;
7569 #endif
7570 struct stat attr;
7571 attr.st_uid = new_uid;
7572 attr.st_gid = new_gid;
7573 int mask = 0;
7574 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7575 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7576 return _setattr(f->inode, &attr, mask, perms);
7577 }
7578
7579 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7580 const UserPerm& perms)
7581 {
7582 std::lock_guard lock(client_lock);
7583 tout(cct) << __func__ << std::endl;
7584 tout(cct) << relpath << std::endl;
7585 tout(cct) << new_uid << std::endl;
7586 tout(cct) << new_gid << std::endl;
7587
7588 if (unmounting)
7589 return -ENOTCONN;
7590
7591 filepath path(relpath);
7592 InodeRef in;
7593 // don't follow symlinks
7594 int r = path_walk(path, &in, perms, false);
7595 if (r < 0)
7596 return r;
7597 struct stat attr;
7598 attr.st_uid = new_uid;
7599 attr.st_gid = new_gid;
7600 int mask = 0;
7601 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7602 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7603 return _setattr(in, &attr, mask, perms);
7604 }
7605
7606 static void attr_set_atime_and_mtime(struct stat *attr,
7607 const utime_t &atime,
7608 const utime_t &mtime)
7609 {
7610 stat_set_atime_sec(attr, atime.tv.tv_sec);
7611 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
7612 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
7613 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
7614 }
7615
// for [l]utime(), invoke the timeval variant, as the timespec
// variants are not yet implemented. for futime[s](), invoke
// the timespec variant.
7619 int Client::utime(const char *relpath, struct utimbuf *buf,
7620 const UserPerm& perms)
7621 {
7622 struct timeval tv[2];
7623 tv[0].tv_sec = buf->actime;
7624 tv[0].tv_usec = 0;
7625 tv[1].tv_sec = buf->modtime;
7626 tv[1].tv_usec = 0;
7627
7628 return utimes(relpath, tv, perms);
7629 }
7630
7631 int Client::lutime(const char *relpath, struct utimbuf *buf,
7632 const UserPerm& perms)
7633 {
7634 struct timeval tv[2];
7635 tv[0].tv_sec = buf->actime;
7636 tv[0].tv_usec = 0;
7637 tv[1].tv_sec = buf->modtime;
7638 tv[1].tv_usec = 0;
7639
7640 return lutimes(relpath, tv, perms);
7641 }
7642
7643 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7644 {
7645 struct timespec ts[2];
7646 ts[0].tv_sec = buf->actime;
7647 ts[0].tv_nsec = 0;
7648 ts[1].tv_sec = buf->modtime;
7649 ts[1].tv_nsec = 0;
7650
7651 return futimens(fd, ts, perms);
7652 }
7653
7654 int Client::utimes(const char *relpath, struct timeval times[2],
7655 const UserPerm& perms)
7656 {
7657 std::lock_guard lock(client_lock);
7658 tout(cct) << __func__ << std::endl;
7659 tout(cct) << relpath << std::endl;
7660 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7661 << std::endl;
7662 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7663 << std::endl;
7664
7665 if (unmounting)
7666 return -ENOTCONN;
7667
7668 filepath path(relpath);
7669 InodeRef in;
7670 int r = path_walk(path, &in, perms);
7671 if (r < 0)
7672 return r;
7673 struct stat attr;
7674 utime_t atime(times[0]);
7675 utime_t mtime(times[1]);
7676
7677 attr_set_atime_and_mtime(&attr, atime, mtime);
7678 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7679 }
7680
7681 int Client::lutimes(const char *relpath, struct timeval times[2],
7682 const UserPerm& perms)
7683 {
7684 std::lock_guard lock(client_lock);
7685 tout(cct) << __func__ << std::endl;
7686 tout(cct) << relpath << std::endl;
7687 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7688 << std::endl;
7689 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7690 << std::endl;
7691
7692 if (unmounting)
7693 return -ENOTCONN;
7694
7695 filepath path(relpath);
7696 InodeRef in;
7697 int r = path_walk(path, &in, perms, false);
7698 if (r < 0)
7699 return r;
7700 struct stat attr;
7701 utime_t atime(times[0]);
7702 utime_t mtime(times[1]);
7703
7704 attr_set_atime_and_mtime(&attr, atime, mtime);
7705 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7706 }
7707
7708 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7709 {
7710 struct timespec ts[2];
7711 ts[0].tv_sec = times[0].tv_sec;
7712 ts[0].tv_nsec = times[0].tv_usec * 1000;
7713 ts[1].tv_sec = times[1].tv_sec;
7714 ts[1].tv_nsec = times[1].tv_usec * 1000;
7715
7716 return futimens(fd, ts, perms);
7717 }
7718
7719 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7720 {
7721 std::lock_guard lock(client_lock);
7722 tout(cct) << __func__ << std::endl;
7723 tout(cct) << fd << std::endl;
7724 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7725 << std::endl;
7726 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7727 << std::endl;
7728
7729 if (unmounting)
7730 return -ENOTCONN;
7731
7732 Fh *f = get_filehandle(fd);
7733 if (!f)
7734 return -EBADF;
7735 #if defined(__linux__) && defined(O_PATH)
7736 if (f->flags & O_PATH)
7737 return -EBADF;
7738 #endif
7739 struct stat attr;
7740 utime_t atime(times[0]);
7741 utime_t mtime(times[1]);
7742
7743 attr_set_atime_and_mtime(&attr, atime, mtime);
7744 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7745 }
7746
7747 int Client::flock(int fd, int operation, uint64_t owner)
7748 {
7749 std::lock_guard lock(client_lock);
7750 tout(cct) << __func__ << std::endl;
7751 tout(cct) << fd << std::endl;
7752 tout(cct) << operation << std::endl;
7753 tout(cct) << owner << std::endl;
7754
7755 if (unmounting)
7756 return -ENOTCONN;
7757
7758 Fh *f = get_filehandle(fd);
7759 if (!f)
7760 return -EBADF;
7761
7762 return _flock(f, operation, owner);
7763 }
7764
7765 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7766 {
7767 std::lock_guard lock(client_lock);
7768 tout(cct) << __func__ << std::endl;
7769 tout(cct) << relpath << std::endl;
7770
7771 if (unmounting)
7772 return -ENOTCONN;
7773
7774 filepath path(relpath);
7775 InodeRef in;
7776 int r = path_walk(path, &in, perms, true);
7777 if (r < 0)
7778 return r;
7779 if (cct->_conf->client_permissions) {
7780 int r = may_open(in.get(), O_RDONLY, perms);
7781 if (r < 0)
7782 return r;
7783 }
7784 r = _opendir(in.get(), dirpp, perms);
7785 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7786 if (r != -ENOTDIR)
7787 tout(cct) << (unsigned long)*dirpp << std::endl;
7788 return r;
7789 }
7790
7791 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7792 {
7793 if (!in->is_dir())
7794 return -ENOTDIR;
7795 *dirpp = new dir_result_t(in, perms);
7796 opened_dirs.insert(*dirpp);
7797 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7798 return 0;
7799 }
7800
7801
// Close a directory iterator previously returned by opendir().
// Always returns 0; @dir must be a valid, open iterator.
int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7812
// Tear down a directory iterator: detach its inode reference, drop any
// buffered dentries, unregister it from opened_dirs, and free it.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7824
// Reset a directory iterator back to the beginning, dropping any
// buffered dentries.  No-op while the client is unmounting.
void Client::rewinddir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7837
// Return the iterator's current readdir position (usable with seekdir()).
// NOTE(review): unlike seekdir()/rewinddir(), this reads d->offset
// without taking client_lock -- presumably relies on caller
// serialization; confirm against the locking conventions.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7844
// Reposition a directory iterator to @offset (a value previously
// returned by telldir()).  Depending on the seek direction and the
// iterator's ordering mode, buffered dentries are dropped and the
// readdir-cache bookkeeping is invalidated.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash order: only a backward seek invalidates the buffer
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag order: drop the buffer when seeking to the start, into a
    // different frag, or backward within the current frag
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7878
7879
7880 //struct dirent {
7881 // ino_t d_ino; /* inode number */
7882 // off_t d_off; /* offset to the next dirent */
7883 // unsigned short d_reclen; /* length of this record */
7884 // unsigned char d_type; /* type of file */
7885 // char d_name[256]; /* filename */
7886 //};
// Fill a struct dirent from name/type/ino.  The name is truncated to
// 255 bytes and always NUL-terminated; @type carries S_IF* mode bits.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;  // placeholder; not the actual record length
  de->d_type = IFTODT(type);  // S_IF* mode bits -> DT_* dirent type
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7902
// Advance the iterator past the current fragment: mark it at end for the
// rightmost frag, otherwise move to the next frag and reset per-frag
// position state.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    // the frag may have split/merged; re-resolve it via the dirfragtree
    _readdir_rechoose_frag(dirp);
  }
}
7928
// Re-resolve the iterator's current frag against the inode's (possibly
// updated) dirfragtree; if it now maps elsewhere, restart the position
// at the beginning of the mapped frag.  No-op in hash order.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7945
// Discard any dentries buffered on this iterator (position is untouched).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
7951
// Issue a READDIR (or LSSNAP for a snapdir) MDS request for the
// iterator's current fragment.  On success the reply path populates
// dirp->buffer/buffer_frag; on error the iterator is marked at end.
// NOTE(review): on -EAGAIN this recurses after re-choosing the frag --
// presumably the dirfragtree converges quickly; verify termination.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // resume listing after the last entry we already returned
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
8006
// Comparator for std::lower_bound over a readdir cache: orders cached
// dentries by their fpos offset relative to a raw offset value.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
8012
/*
 * Serve readdir entries straight from the client dentry cache.
 *
 * Returns -EAGAIN as soon as the directory stops being complete-and-
 * ordered, or the cache shifts underneath us (which can happen whenever
 * client_lock is dropped around the user callback or _getattr blocks),
 * so the caller can fall back to fetching frags from the MDS.
 */
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // binary-search the cache for the first dentry at/after our offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // completeness may be lost while the lock was dropped below
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int idx = pd - dir->readdir_cache.begin();
    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      // hand the callback a referenced inode; callee releases
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop the client lock around the user callback
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8102
/*
 * Core readdir walk: synthesize "." and "..", then serve entries either
 * from the local readdir cache or by fetching dirfrags from the MDS,
 * invoking @cb for each entry.
 *
 * Callback protocol: cb returns <0 to abort (the value is propagated),
 * 0 to continue, >0 to stop early (the positive value is propagated).
 * client_lock is dropped around each cb invocation.
 */
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: emit "." (the directory itself)
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the client lock around the user callback
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: emit ".." (first parent, or self at the root)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache shifted under us; fall back to fetching frags
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      // drop the client lock around the user callback
      client_lock.unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // we walked the whole directory; if nothing changed underneath us,
    // the dentry cache now holds a complete (and maybe ordered) listing
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // unreachable: the loop above always returns
  return 0;
}
8295
8296
// readdir_r(): plain-dirent wrapper around readdirplus_r() with no
// statx output, no want/flags bits, and no inode return.
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
8301
8302 /*
8303 * readdirplus_r
8304 *
8305 * returns
8306 * 1 if we got a dirent
8307 * 0 for end of directory
8308 * <0 on error
8309 */
8310
/* Callback context for fetching exactly one directory entry. */
struct single_readdir {
  struct dirent *de;       // where the dirent is copied
  struct ceph_statx *stx;  // optional statx destination (may be NULL)
  Inode *inode;            // inode handed back by the walk, if requested
  bool full;               // set once a dirent has been delivered
};
8317
8318 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8319 struct ceph_statx *stx, off_t off,
8320 Inode *in)
8321 {
8322 single_readdir *c = static_cast<single_readdir *>(p);
8323
8324 if (c->full)
8325 return -1; // already filled this dirent
8326
8327 *c->de = *de;
8328 if (c->stx)
8329 *c->stx = *stx;
8330 c->inode = in;
8331 c->full = true;
8332 return 1;
8333 }
8334
8335 struct dirent *Client::readdir(dir_result_t *d)
8336 {
8337 int ret;
8338 static struct dirent de;
8339 single_readdir sr;
8340 sr.de = &de;
8341 sr.stx = NULL;
8342 sr.inode = NULL;
8343 sr.full = false;
8344
8345 // our callback fills the dirent and sets sr.full=true on first
8346 // call, and returns -1 the second time around.
8347 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8348 if (ret < -1) {
8349 errno = -ret; // this sucks.
8350 return (dirent *) NULL;
8351 }
8352 if (sr.full) {
8353 return &de;
8354 }
8355 return (dirent *) NULL;
8356 }
8357
8358 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8359 struct ceph_statx *stx, unsigned want,
8360 unsigned flags, Inode **out)
8361 {
8362 single_readdir sr;
8363 sr.de = de;
8364 sr.stx = stx;
8365 sr.inode = NULL;
8366 sr.full = false;
8367
8368 // our callback fills the dirent and sets sr.full=true on first
8369 // call, and returns -1 the second time around.
8370 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8371 if (r < -1)
8372 return r;
8373 if (out)
8374 *out = sr.inode;
8375 if (sr.full)
8376 return 1;
8377 return 0;
8378 }
8379
8380
8381 /* getdents */
/* Callback context for _getdents(): a byte buffer being filled with
 * either whole dirents or bare NUL-terminated names. */
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: copy whole dirents; false: names only
};
8388
8389 static int _readdir_getdent_cb(void *p, struct dirent *de,
8390 struct ceph_statx *stx, off_t off, Inode *in)
8391 {
8392 struct getdents_result *c = static_cast<getdents_result *>(p);
8393
8394 int dlen;
8395 if (c->fullent)
8396 dlen = sizeof(*de);
8397 else
8398 dlen = strlen(de->d_name) + 1;
8399
8400 if (c->pos + dlen > c->buflen)
8401 return -1; // doesn't fit
8402
8403 if (c->fullent) {
8404 memcpy(c->buf + c->pos, de, sizeof(*de));
8405 } else {
8406 memcpy(c->buf + c->pos, de->d_name, dlen);
8407 }
8408 c->pos += dlen;
8409 return 0;
8410 }
8411
8412 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8413 {
8414 getdents_result gr;
8415 gr.buf = buf;
8416 gr.buflen = buflen;
8417 gr.fullent = fullent;
8418 gr.pos = 0;
8419
8420 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8421
8422 if (r < 0) { // some error
8423 if (r == -1) { // buffer ran out of space
8424 if (gr.pos) { // but we got some entries already!
8425 return gr.pos;
8426 } // or we need a larger buffer
8427 return -ERANGE;
8428 } else { // actual error, return it
8429 return r;
8430 }
8431 }
8432 return gr.pos;
8433 }
8434
8435
8436 /* getdir */
// Accumulator for _getdir_cb: collects entry names into a caller-owned
// list and counts how many were seen.
struct getdir_result {
  list<string> *contents;  // names appended in readdir order
  int num;                 // number of entries collected
};
8441
8442 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8443 {
8444 getdir_result *r = static_cast<getdir_result *>(p);
8445
8446 r->contents->push_back(de->d_name);
8447 r->num++;
8448 return 0;
8449 }
8450
// List the names of all entries in `relpath` into `contents`.
// Returns the number of entries on success, or a negative errno.
int Client::getdir(const char *relpath, list<string>& contents,
		   const UserPerm& perms)
{
  ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
  {
    // take client_lock only for tracing; opendir/closedir lock internally
    std::lock_guard lock(client_lock);
    tout(cct) << "getdir" << std::endl;
    tout(cct) << relpath << std::endl;
  }

  dir_result_t *d;
  int r = opendir(relpath, &d, perms);
  if (r < 0)
    return r;

  getdir_result gr;
  gr.contents = &contents;
  gr.num = 0;
  r = readdir_r_cb(d, _getdir_cb, (void *)&gr);

  closedir(d);

  if (r < 0)
    return r;
  return gr.num;
}
8477
8478
8479 /****** file i/o **********/
// Open (and optionally create) `relpath`, returning a new integer file
// descriptor on success or a negative errno.  The striping parameters
// are passed through to _create() and only matter for newly created files.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // O_CREAT|O_EXCL: the path must not already exist
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // create the missing final path component in its parent directory
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create() may already have produced an Fh; otherwise open the inode now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8564
// Convenience overload: open with default file layout (striping) parameters.
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8570
8571 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8572 const UserPerm& perms)
8573 {
8574 std::lock_guard lock(client_lock);
8575 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8576
8577 if (unmounting)
8578 return -ENOTCONN;
8579
8580 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8581 filepath path(ino);
8582 req->set_filepath(path);
8583
8584 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8585 char f[30];
8586 sprintf(f, "%u", h);
8587 filepath path2(dirino);
8588 path2.push_dentry(string(f));
8589 req->set_filepath2(path2);
8590
8591 int r = make_request(req, perms, NULL, NULL,
8592 rand() % mdsmap->get_num_in_mds());
8593 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8594 return r;
8595 }
8596
8597
/**
 * Load inode into local cache.
 *
 * If the inode pointer is non-NULL, also take a reference on the
 * resulting Inode object in the same operation, so that the caller
 * can safely assume the inode will still be there after return.
 */
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // the request is sent to a random in-MDS rank
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // on success the reply populated inode_map; hand out a referenced Inode*
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
  return r;
}
8627
// Public wrapper for _lookup_ino(); takes client_lock.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
8633
8634 /**
8635 * Find the parent inode of `ino` and insert it into
8636 * our cache. Conditionally also set `parent` to a referenced
8637 * Inode* if caller provides non-NULL value.
8638 */
8639 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8640 {
8641 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
8642
8643 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8644 filepath path(ino->ino);
8645 req->set_filepath(path);
8646
8647 InodeRef target;
8648 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8649 // Give caller a reference to the parent ino if they provided a pointer.
8650 if (parent != NULL) {
8651 if (r == 0) {
8652 *parent = target.get();
8653 _ll_get(*parent);
8654 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
8655 } else {
8656 *parent = NULL;
8657 }
8658 }
8659 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
8660 return r;
8661 }
8662
/**
 * Populate the parent dentry for `ino`, provided it is
 * a child of `parent`.
 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  // filepath2 carries the parent directory, filepath the child inode
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // the request is sent to a random in-MDS rank
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8684
// Public wrapper for _lookup_name(); takes client_lock.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
8690
// Construct a new file handle for `in`, configuring its readahead limits
// from the client configuration and the inode's striping layout.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // snapshot inode: track this open handle via snap_cap_refs
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
		  << ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  // max readahead is the smaller of the byte limit and the period limit,
  // when either is configured; otherwise unlimited
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead requests to stripe period and stripe unit boundaries
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
8722
// Tear down a file handle: drop open refs/caps, release file locks, and
// surface any asynchronous write-back error recorded on the handle.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last opener for this mode: flush dirty data and re-evaluate caps
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inode: just drop the handle's snap reference
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8757
8758 void Client::_put_fh(Fh *f)
8759 {
8760 int left = f->put();
8761 if (!left) {
8762 delete f;
8763 }
8764 }
8765
// Core open path (client_lock held): take an open ref on the inode, issue
// a CEPH_MDS_OP_OPEN to the MDS unless already-issued caps cover the
// desired mode, and on success hand back a new Fh via *fhp.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only: refuse any write-implying flag
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode);
      } else {
	// only needed the caps momentarily to let delegations drain
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8846
// Re-establish file caps for `in`: if the caps we already hold are
// adequate, just re-advertise wants via check_caps(); otherwise replay an
// MDS open matching the currently wanted read/write mode.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate the wanted caps back into open flags for the MDS request
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8884
// Close an integer file descriptor: release the underlying Fh and free
// the fd for reuse.  Returns any async write-back error from the handle.
int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
8904
8905
8906 // ------------
8907 // read, write
8908
// lseek(2) entry point: validate the fd and delegate to _lseek().
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors are rejected for I/O-related operations
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _lseek(f, offset, whence);
}
8929
// Reposition the file offset of `f`.  SEEK_END/SEEK_DATA/SEEK_HOLE need
// an up-to-date file size, so those refresh it via _getattr() first.
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  // first pass: decide whether this whence requires a fresh size
  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    // the file is treated as one contiguous data extent: data at `offset`
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    // ... and the only hole is the implicit one at end-of-file
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -EINVAL;
  }

  // a negative resulting position (e.g. SEEK_CUR past the start) is invalid
  if (pos < 0) {
    return -EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
9003
9004
// Serialize access to f->pos.  If another thread holds the position lock
// (or is already queued for it), wait on a condvar; FIFO fairness comes
// from requiring that we are at the head of pos_waiters before proceeding.
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // client_lock is already held; adopt it so the wait can drop/retake it
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    // keep client_lock held on return: the guard must not unlock it again
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
9025
// Release the f->pos lock taken by lock_fh_pos().
// NOTE(review): no condvar notify happens here — verify that waiters
// queued in lock_fh_pos() are woken elsewhere.
void Client::unlock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;
  f->pos_locked = false;
}
9031
// Migrate inline file data out to the first RADOS object (<ino>.00000000).
// Issues two mutations: an unguarded create, then a write guarded by a
// cmpxattr on the object's "inline_version".  `onfinish` is completed
// with the result of the guarded write.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    // nothing inline: report immediate success
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // guard against racing uninline attempts via the version xattr
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
9076
9077 //
9078
9079 // blocking osd interface
9080
// read(2) entry point: read up to `size` bytes from `fd` at `offset`
// (or from the current position when offset < 0) into `buf`.
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes read larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // copy the bufferlist contents out to the caller's flat buffer
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9110
9111 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9112 {
9113 if (iovcnt < 0)
9114 return -EINVAL;
9115 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9116 }
9117
// Core read path (client_lock held).  Handles implicit-position reads
// (offset < 0), inline data, cached (ObjectCacher) and synchronous OSD
// reads, plus the retry needed when a short read races with a size change.
// Returns bytes read or a negative errno.
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // read from (and afterwards advance) the file position
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // fetch inline-data state from the MDS before deciding how to read
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0) {
    goto done;
  }
  if (f->flags & O_DIRECT)
    // O_DIRECT bypasses the cache even if we hold the caps for it
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // can't serve inline data without the CACHE cap: push it to RADOS first
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // serve the read straight from the inline blob, zero-filling any gap
      // between the blob's length and the (possibly larger) file size
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
	r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
	bl->append_zero(endoff - offset);
	r = endoff - offset;
      } else {
	r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read: the file may have grown since we looked at the size
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof? short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // wait for the uninline writeback without holding client_lock
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -ECANCELED) {
      // inline data is now on the OSDs; clear our copy and dirty the caps
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return r;
}
9257
// Completion for background readahead issued from _read_async().  Holds a
// ref on the Fh (and a pending-readahead count) for the I/O's lifetime.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}

void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // drop the cap refs taken when the readahead was submitted
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9273
// Read through the ObjectCacher (blocking if the data must be fetched),
// then kick off background readahead for a following extent if configured.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
		 << " max_bytes=" << f->readahead.get_max_readahead_size()
		 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // data not fully cached: wait for the fill, dropping client_lock
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      // NULL bl: we only want the data pulled into the cache
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	// released by C_Readahead::finish() when the I/O completes
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9328
// Read directly from the OSDs (no object cache), looping until `len`
// bytes are read.  A short read inside the known file size is zero-filled
// up to EOF; *checkeof is set when the caller should re-verify the file
// size and possibly retry.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    // block for the OSD reply without holding client_lock
    client_lock.unlock();
    int r = onfinish.wait();
    client_lock.lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // stopped at (or beyond) our idea of EOF: let the caller recheck size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9388
9389
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync can DDRT.
 */
// Called when one uncommitted sync write has been committed: drop the
// counter and the buffer cap ref, and wake the unmount path if it was
// waiting for the last one.
void Client::_sync_write_commit(Inode *in)
{
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.notify_all();
  }
}
9407
// write(2) entry point: write `size` bytes from `buf` to `fd` at `offset`
// (offset < 0 means the current file position; see _write()).
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _write(fh, offset, size, buf, NULL, false);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}
9432
9433 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9434 {
9435 if (iovcnt < 0)
9436 return -EINVAL;
9437 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9438 }
9439
// Shared vectored-I/O implementation (client_lock held).  Computes the
// total iov length, optionally clamps it to INT_MAX (for callers that
// return the count as a signed int), delegates to _write()/_read(), and,
// for reads, scatters the resulting bufferlist back into the iovecs.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				   unsigned iovcnt, int64_t offset, bool write,
				   bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    return r;
  }
}
9486
// fd-based wrapper for vectored I/O: looks up the Fh and calls the
// locked implementation with int clamping enabled.
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  std::lock_guard lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
}
9501
/*
 * Write 'size' bytes to the file referred to by handle 'f'.
 *
 * Data comes from 'buf' if non-NULL, otherwise it is gathered from the
 * iovec array (iov, iovcnt).  A negative 'offset' means "use the fd's
 * current position" (honoring O_APPEND) and advance it on success.
 *
 * Returns the number of bytes written or a negative errno.  client_lock
 * must be held on entry; it is dropped and re-acquired around the
 * blocking waits below.
 */
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
                       const struct iovec *iov, int iovcnt)
{
  // New fd position to install on success; 0 means "leave f->pos alone".
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  // Refuse new writes while the data pool is full.
  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  // Snapshots are read-only; we should never have a write handle to one.
  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
        unlock_fh_pos(f);
        return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
                                                   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // Learn whether the file has inline data before choosing a write path.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // Block until we hold FILE_WR (and AUTH_SHARED, needed for the
  // setuid/setgid check just below).
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses both the object cacher and lazy I/O buffering.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    // Either migrate inline data out to RADOS ("uninline") when the write
    // no longer fits (or we cannot buffer), or update it in place.
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      // Preserve the tail beyond the write's end before splicing.
      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
                                 in->snaprealm->get_snap_context(),
                                 offset, size, bl, ceph::real_clock::now(),
                                 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
                       offset, size, bl, ceph::real_clock::now(), 0,
                       in->truncate_size, in->truncate_seq,
                       &onfinish);
    // Drop client_lock while the OSD write is in flight.
    client_lock.unlock();
    onfinish.wait();
    client_lock.lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  // NOTE(review): r is int, so a write larger than INT_MAX would truncate
  // here; callers clamp totallen to INT_MAX (see _preadv_pwritev_locked).
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // Wait for the uninline started above; on success (or if it was
    // already uninlined: -ECANCELED) drop our inline copy.
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9732
9733 int Client::_flush(Fh *f)
9734 {
9735 Inode *in = f->inode.get();
9736 int err = f->take_async_err();
9737 if (err != 0) {
9738 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9739 << cpp_strerror(err) << dendl;
9740 } else {
9741 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9742 }
9743
9744 return err;
9745 }
9746
9747 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9748 {
9749 struct ceph_statx stx;
9750 stx.stx_size = length;
9751 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9752 }
9753
9754 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9755 {
9756 std::lock_guard lock(client_lock);
9757 tout(cct) << __func__ << std::endl;
9758 tout(cct) << fd << std::endl;
9759 tout(cct) << length << std::endl;
9760
9761 if (unmounting)
9762 return -ENOTCONN;
9763
9764 Fh *f = get_filehandle(fd);
9765 if (!f)
9766 return -EBADF;
9767 #if defined(__linux__) && defined(O_PATH)
9768 if (f->flags & O_PATH)
9769 return -EBADF;
9770 #endif
9771 struct stat attr;
9772 attr.st_size = length;
9773 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9774 }
9775
9776 int Client::fsync(int fd, bool syncdataonly)
9777 {
9778 std::lock_guard lock(client_lock);
9779 tout(cct) << "fsync" << std::endl;
9780 tout(cct) << fd << std::endl;
9781 tout(cct) << syncdataonly << std::endl;
9782
9783 if (unmounting)
9784 return -ENOTCONN;
9785
9786 Fh *f = get_filehandle(fd);
9787 if (!f)
9788 return -EBADF;
9789 #if defined(__linux__) && defined(O_PATH)
9790 if (f->flags & O_PATH)
9791 return -EBADF;
9792 #endif
9793 int r = _fsync(f, syncdataonly);
9794 if (r == 0) {
9795 // The IOs in this fsync were okay, but maybe something happened
9796 // in the background that we shoudl be reporting?
9797 r = f->take_async_err();
9798 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9799 << ") = 0, async_err = " << r << dendl;
9800 } else {
9801 // Assume that an error we encountered during fsync, even reported
9802 // synchronously, would also have applied the error to the Fh, and we
9803 // should clear it here to avoid returning the same error again on next
9804 // call.
9805 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9806 << r << dendl;
9807 f->take_async_err();
9808 }
9809 return r;
9810 }
9811
/*
 * Flush this inode's dirty data (and, unless syncdataonly, its dirty
 * caps/metadata and unsafe MDS requests) to stable storage.
 *
 * Returns 0 on success or a negative errno from the data flush.
 * client_lock must be held; it is dropped while waiting on the object
 * cacher flush.
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  // Kick off a flush of cached data; we wait for the completion below,
  // after metadata has been pushed.
  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  // Push dirty caps to the MDS synchronously and remember which flush
  // generation to wait for.
  if (!syncdataonly && in->dirty_caps) {
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  // Wait until the MDS has made our unsafe requests durable; flushing the
  // MDS log first speeds that up.  Waiting on the most recent request is
  // enough since they complete in order.
  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  // Only wait for the cap flush if the data flush succeeded.
  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
9878
9879 int Client::_fsync(Fh *f, bool syncdataonly)
9880 {
9881 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9882 return _fsync(f->inode.get(), syncdataonly);
9883 }
9884
9885 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9886 {
9887 std::lock_guard lock(client_lock);
9888 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9889 tout(cct) << fd << std::endl;
9890
9891 if (unmounting)
9892 return -ENOTCONN;
9893
9894 Fh *f = get_filehandle(fd);
9895 if (!f)
9896 return -EBADF;
9897 int r = _getattr(f->inode, mask, perms);
9898 if (r < 0)
9899 return r;
9900 fill_stat(f->inode, stbuf, NULL);
9901 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9902 return r;
9903 }
9904
9905 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9906 unsigned int want, unsigned int flags)
9907 {
9908 std::lock_guard lock(client_lock);
9909 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9910 tout(cct) << fd << std::endl;
9911
9912 if (unmounting)
9913 return -ENOTCONN;
9914
9915 Fh *f = get_filehandle(fd);
9916 if (!f)
9917 return -EBADF;
9918
9919 unsigned mask = statx_to_mask(flags, want);
9920
9921 int r = 0;
9922 if (mask && !f->inode->caps_issued_mask(mask, true)) {
9923 r = _getattr(f->inode, mask, perms);
9924 if (r < 0) {
9925 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9926 return r;
9927 }
9928 }
9929
9930 fill_statx(f->inode, mask, stx);
9931 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9932 return r;
9933 }
9934
9935 // not written yet, but i want to link!
9936
9937 int Client::chdir(const char *relpath, std::string &new_cwd,
9938 const UserPerm& perms)
9939 {
9940 std::lock_guard lock(client_lock);
9941 tout(cct) << "chdir" << std::endl;
9942 tout(cct) << relpath << std::endl;
9943
9944 if (unmounting)
9945 return -ENOTCONN;
9946
9947 filepath path(relpath);
9948 InodeRef in;
9949 int r = path_walk(path, &in, perms);
9950 if (r < 0)
9951 return r;
9952
9953 if (!(in.get()->is_dir()))
9954 return -ENOTDIR;
9955
9956 if (cwd != in)
9957 cwd.swap(in);
9958 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9959
9960 _getcwd(new_cwd, perms);
9961 return 0;
9962 }
9963
/*
 * Compute the absolute path of the current working directory into 'dir'
 * by walking cached dentries from cwd up toward root.  If a parent link
 * is not in our cache, a LOOKUPNAME request is issued to the MDS and the
 * walk restarts from cwd.  If cwd (or an ancestor) has been unlinked,
 * 'dir' is left untouched.
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
        break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    // Prepend this component and move up one directory.
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
10003
10004 void Client::getcwd(string& dir, const UserPerm& perms)
10005 {
10006 std::lock_guard l(client_lock);
10007 if (!unmounting)
10008 _getcwd(dir, perms);
10009 }
10010
10011 int Client::statfs(const char *path, struct statvfs *stbuf,
10012 const UserPerm& perms)
10013 {
10014 std::lock_guard l(client_lock);
10015 tout(cct) << __func__ << std::endl;
10016 unsigned long int total_files_on_fs;
10017
10018 if (unmounting)
10019 return -ENOTCONN;
10020
10021 ceph_statfs stats;
10022 C_SaferCond cond;
10023
10024 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10025 if (data_pools.size() == 1) {
10026 objecter->get_fs_stats(stats, data_pools[0], &cond);
10027 } else {
10028 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10029 }
10030
10031 client_lock.unlock();
10032 int rval = cond.wait();
10033 assert(root);
10034 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10035 client_lock.lock();
10036
10037 if (rval < 0) {
10038 ldout(cct, 1) << "underlying call to statfs returned error: "
10039 << cpp_strerror(rval)
10040 << dendl;
10041 return rval;
10042 }
10043
10044 memset(stbuf, 0, sizeof(*stbuf));
10045
10046 /*
10047 * we're going to set a block size of 4MB so we can represent larger
10048 * FSes without overflowing. Additionally convert the space
10049 * measurements from KB to bytes while making them in terms of
10050 * blocks. We use 4MB only because it is big enough, and because it
10051 * actually *is* the (ceph) default block size.
10052 */
10053 const int CEPH_BLOCK_SHIFT = 22;
10054 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10055 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10056 stbuf->f_files = total_files_on_fs;
10057 stbuf->f_ffree = 0;
10058 stbuf->f_favail = -1;
10059 stbuf->f_fsid = -1; // ??
10060 stbuf->f_flag = 0; // ??
10061 stbuf->f_namemax = NAME_MAX;
10062
10063 // Usually quota_root will == root_ancestor, but if the mount root has no
10064 // quota but we can see a parent of it that does have a quota, we'll
10065 // respect that one instead.
10066 ceph_assert(root != nullptr);
10067 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10068
10069 // get_quota_root should always give us something
10070 // because client quotas are always enabled
10071 ceph_assert(quota_root != nullptr);
10072
10073 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10074
10075 // Skip the getattr if any sessions are stale, as we don't want to
10076 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10077 // is unhealthy.
10078 if (!_any_stale_sessions()) {
10079 int r = _getattr(quota_root, 0, perms, true);
10080 if (r != 0) {
10081 // Ignore return value: error getting latest inode metadata is not a good
10082 // reason to break "df".
10083 lderr(cct) << "Error in getattr on quota root 0x"
10084 << std::hex << quota_root->ino << std::dec
10085 << " statfs result may be outdated" << dendl;
10086 }
10087 }
10088
10089 // Special case: if there is a size quota set on the Inode acting
10090 // as the root for this client mount, then report the quota status
10091 // as the filesystem statistics.
10092 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10093 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10094 // It is possible for a quota to be exceeded: arithmetic here must
10095 // handle case where used > total.
10096 const fsblkcnt_t free = total > used ? total - used : 0;
10097
10098 stbuf->f_blocks = total;
10099 stbuf->f_bfree = free;
10100 stbuf->f_bavail = free;
10101 } else {
10102 // General case: report the cluster statistics returned from RADOS. Because
10103 // multiple pools may be used without one filesystem namespace via
10104 // layouts, this is the most correct thing we can do.
10105 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10106 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10107 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10108 }
10109
10110 return rval;
10111 }
10112
/*
 * Issue a file-lock operation to the MDS for inode 'in' via handle 'fh'.
 *
 * lock_type: CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
 * op:        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
 * sleep:     non-zero to block waiting for a conflicting lock
 * fl:        POSIX lock description; filled in with the conflicting (or
 *            absent) lock on GETFILELOCK
 * owner:     lock-owner token; the top bit is forced on (see below)
 * removing:  true when called from _release_filelocks, so the per-handle
 *            lock bookkeeping is not re-populated
 *
 * Returns 0 on success or a negative errno (-EIO for an unknown l_type).
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
                         struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
                 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
                 << " type " << fl->l_type << " owner " << owner
                 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Map the POSIX lock type onto the ceph wire value.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // Only a SETFILELOCK that actually takes a lock can block.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the lock the MDS reported back into the caller's flock.
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
        fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
        fl->l_type = F_WRLCK;
      else
        fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the successful lock change into the per-inode state
      // (lazily allocating the lock-state object) ...
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
        if (!in->fcntl_locks)
          in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
        lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
        if (!in->flock_locks)
          in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
        lock_state = in->flock_locks.get();
      } else {
        ceph_abort();
        return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // ... and into the per-handle state (used by _release_filelocks on
      // close), unless we are in the middle of releasing them.
      if (!removing) {
        if (lock_type == CEPH_LOCK_FCNTL) {
          if (!fh->fcntl_locks)
            fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
          lock_state = fh->fcntl_locks.get();
        } else {
          if (!fh->flock_locks)
            fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
          lock_state = fh->flock_locks.get();
        }
        _update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
10223
/*
 * Interrupt a blocked file-lock request 'req'.  Marks the request
 * aborted (so it will not be re-sent) and, if it already reached an MDS,
 * sends a companion *_INTR unlock request to cancel it server-side.
 * Returns 0, or a negative errno from the interrupt request.
 */
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Choose the interrupt rule matching the original lock flavor.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // The interrupt request mirrors the original's arguments but asks for
  // UNLOCK under the *_INTR rule.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
10256
10257 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10258 {
10259 if (!in->fcntl_locks && !in->flock_locks)
10260 return;
10261
10262 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10263 encode(nr_fcntl_locks, bl);
10264 if (nr_fcntl_locks) {
10265 auto &lock_state = in->fcntl_locks;
10266 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10267 p != lock_state->held_locks.end();
10268 ++p)
10269 encode(p->second, bl);
10270 }
10271
10272 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10273 encode(nr_flock_locks, bl);
10274 if (nr_flock_locks) {
10275 auto &lock_state = in->flock_locks;
10276 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10277 p != lock_state->held_locks.end();
10278 ++p)
10279 encode(p->second, bl);
10280 }
10281
10282 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10283 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10284 }
10285
10286 void Client::_release_filelocks(Fh *fh)
10287 {
10288 if (!fh->fcntl_locks && !fh->flock_locks)
10289 return;
10290
10291 Inode *in = fh->inode.get();
10292 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
10293
10294 list<pair<int, ceph_filelock> > to_release;
10295
10296 if (fh->fcntl_locks) {
10297 auto &lock_state = fh->fcntl_locks;
10298 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10299 p != lock_state->held_locks.end();
10300 ++p)
10301 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
10302 lock_state.reset();
10303 }
10304 if (fh->flock_locks) {
10305 auto &lock_state = fh->flock_locks;
10306 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10307 p != lock_state->held_locks.end();
10308 ++p)
10309 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
10310 lock_state.reset();
10311 }
10312
10313 if (to_release.empty())
10314 return;
10315
10316 // mds has already released filelocks if session was closed.
10317 if (in->caps.empty())
10318 return;
10319
10320 struct flock fl;
10321 memset(&fl, 0, sizeof(fl));
10322 fl.l_whence = SEEK_SET;
10323 fl.l_type = F_UNLCK;
10324
10325 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10326 p != to_release.end();
10327 ++p) {
10328 fl.l_start = p->second.start;
10329 fl.l_len = p->second.length;
10330 fl.l_pid = p->second.pid;
10331 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10332 p->second.owner, true);
10333 }
10334 }
10335
10336 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10337 ceph_lock_state_t *lock_state)
10338 {
10339 int lock_cmd;
10340 if (F_RDLCK == fl->l_type)
10341 lock_cmd = CEPH_LOCK_SHARED;
10342 else if (F_WRLCK == fl->l_type)
10343 lock_cmd = CEPH_LOCK_EXCL;
10344 else
10345 lock_cmd = CEPH_LOCK_UNLOCK;;
10346
10347 ceph_filelock filelock;
10348 filelock.start = fl->l_start;
10349 filelock.length = fl->l_len;
10350 filelock.client = 0;
10351 // see comment in _do_filelock()
10352 filelock.owner = owner | (1ULL << 63);
10353 filelock.pid = fl->l_pid;
10354 filelock.type = lock_cmd;
10355
10356 if (filelock.type == CEPH_LOCK_UNLOCK) {
10357 list<ceph_filelock> activated_locks;
10358 lock_state->remove_lock(filelock, activated_locks);
10359 } else {
10360 bool r = lock_state->add_lock(filelock, false, false, NULL);
10361 ceph_assert(r);
10362 }
10363 }
10364
10365 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10366 {
10367 Inode *in = fh->inode.get();
10368 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10369 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10370 return ret;
10371 }
10372
10373 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10374 {
10375 Inode *in = fh->inode.get();
10376 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10377 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10378 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10379 return ret;
10380 }
10381
10382 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10383 {
10384 Inode *in = fh->inode.get();
10385 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10386
10387 int sleep = !(cmd & LOCK_NB);
10388 cmd &= ~LOCK_NB;
10389
10390 int type;
10391 switch (cmd) {
10392 case LOCK_SH:
10393 type = F_RDLCK;
10394 break;
10395 case LOCK_EX:
10396 type = F_WRLCK;
10397 break;
10398 case LOCK_UN:
10399 type = F_UNLCK;
10400 break;
10401 default:
10402 return -EINVAL;
10403 }
10404
10405 struct flock fl;
10406 memset(&fl, 0, sizeof(fl));
10407 fl.l_type = type;
10408 fl.l_whence = SEEK_SET;
10409
10410 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10411 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10412 return ret;
10413 }
10414
10415 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10416 {
10417 /* Since the only thing this does is wrap a call to statfs, and
10418 statfs takes a lock, it doesn't seem we have a need to split it
10419 out. */
10420 return statfs(0, stbuf, perms);
10421 }
10422
10423 void Client::ll_register_callbacks(struct client_callback_args *args)
10424 {
10425 if (!args)
10426 return;
10427 std::lock_guard l(client_lock);
10428 ldout(cct, 10) << __func__ << " cb " << args->handle
10429 << " invalidate_ino_cb " << args->ino_cb
10430 << " invalidate_dentry_cb " << args->dentry_cb
10431 << " switch_interrupt_cb " << args->switch_intr_cb
10432 << " remount_cb " << args->remount_cb
10433 << dendl;
10434 callback_handle = args->handle;
10435 if (args->ino_cb) {
10436 ino_invalidate_cb = args->ino_cb;
10437 async_ino_invalidator.start();
10438 }
10439 if (args->dentry_cb) {
10440 dentry_invalidate_cb = args->dentry_cb;
10441 async_dentry_invalidator.start();
10442 }
10443 if (args->switch_intr_cb) {
10444 switch_interrupt_cb = args->switch_intr_cb;
10445 interrupt_finisher.start();
10446 }
10447 if (args->remount_cb) {
10448 remount_cb = args->remount_cb;
10449 remount_finisher.start();
10450 }
10451 umask_cb = args->umask_cb;
10452 }
10453
10454 int Client::test_dentry_handling(bool can_invalidate)
10455 {
10456 int r = 0;
10457
10458 can_invalidate_dentries = can_invalidate;
10459
10460 if (can_invalidate_dentries) {
10461 ceph_assert(dentry_invalidate_cb);
10462 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10463 r = 0;
10464 } else {
10465 ceph_assert(remount_cb);
10466 ldout(cct, 1) << "using remount_cb" << dendl;
10467 r = _do_remount(false);
10468 }
10469
10470 return r;
10471 }
10472
10473 int Client::_sync_fs()
10474 {
10475 ldout(cct, 10) << __func__ << dendl;
10476
10477 // flush file data
10478 std::unique_ptr<C_SaferCond> cond = nullptr;
10479 if (cct->_conf->client_oc) {
10480 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10481 objectcacher->flush_all(cond.get());
10482 }
10483
10484 // flush caps
10485 flush_caps_sync();
10486 ceph_tid_t flush_tid = last_flush_tid;
10487
10488 // wait for unsafe mds requests
10489 wait_unsafe_requests();
10490
10491 wait_sync_caps(flush_tid);
10492
10493 if (nullptr != cond) {
10494 client_lock.unlock();
10495 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10496 cond->wait();
10497 ldout(cct, 15) << __func__ << " flush finished" << dendl;
10498 client_lock.lock();
10499 }
10500
10501 return 0;
10502 }
10503
10504 int Client::sync_fs()
10505 {
10506 std::lock_guard l(client_lock);
10507
10508 if (unmounting)
10509 return -ENOTCONN;
10510
10511 return _sync_fs();
10512 }
10513
10514 int64_t Client::drop_caches()
10515 {
10516 std::lock_guard l(client_lock);
10517 return objectcacher->release_all();
10518 }
10519
10520 int Client::_lazyio(Fh *fh, int enable)
10521 {
10522 Inode *in = fh->inode.get();
10523 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10524
10525 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10526 return 0;
10527
10528 int orig_mode = fh->mode;
10529 if (enable) {
10530 fh->mode |= CEPH_FILE_MODE_LAZY;
10531 in->get_open_ref(fh->mode);
10532 in->put_open_ref(orig_mode);
10533 check_caps(in, CHECK_CAPS_NODELAY);
10534 } else {
10535 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10536 in->get_open_ref(fh->mode);
10537 in->put_open_ref(orig_mode);
10538 check_caps(in, 0);
10539 }
10540
10541 return 0;
10542 }
10543
10544 int Client::lazyio(int fd, int enable)
10545 {
10546 std::lock_guard l(client_lock);
10547 Fh *f = get_filehandle(fd);
10548 if (!f)
10549 return -EBADF;
10550
10551 return _lazyio(f, enable);
10552 }
10553
// Low-level (libcephfs) entry point for toggling lazy I/O on an open
// file handle; takes the client lock and delegates to _lazyio().
int Client::ll_lazyio(Fh *fh, int enable)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
  tout(cct) << __func__ << std::endl;

  return _lazyio(fh, enable);
}
10562
10563 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
10564 {
10565 std::lock_guard l(client_lock);
10566 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
10567 << ", " << offset << ", " << count << ")" << dendl;
10568
10569 Fh *f = get_filehandle(fd);
10570 if (!f)
10571 return -EBADF;
10572
10573 // for now
10574 _fsync(f, true);
10575
10576 return 0;
10577 }
10578
10579 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10580 {
10581 std::lock_guard l(client_lock);
10582 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10583 << ", " << offset << ", " << count << ")" << dendl;
10584
10585 Fh *f = get_filehandle(fd);
10586 if (!f)
10587 return -EBADF;
10588 Inode *in = f->inode.get();
10589
10590 _fsync(f, true);
10591 if (_release(in)) {
10592 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10593 if (r < 0)
10594 return r;
10595 }
10596 return 0;
10597 }
10598
10599
10600 // =============================
10601 // snaps
10602
10603 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10604 {
10605 std::lock_guard l(client_lock);
10606
10607 if (unmounting)
10608 return -ENOTCONN;
10609
10610 filepath path(relpath);
10611 InodeRef in;
10612 int r = path_walk(path, &in, perm);
10613 if (r < 0)
10614 return r;
10615 if (cct->_conf->client_permissions) {
10616 r = may_create(in.get(), perm);
10617 if (r < 0)
10618 return r;
10619 }
10620 Inode *snapdir = open_snapdir(in.get());
10621 return _mkdir(snapdir, name, 0, perm);
10622 }
10623
10624 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10625 {
10626 std::lock_guard l(client_lock);
10627
10628 if (unmounting)
10629 return -ENOTCONN;
10630
10631 filepath path(relpath);
10632 InodeRef in;
10633 int r = path_walk(path, &in, perms);
10634 if (r < 0)
10635 return r;
10636 if (cct->_conf->client_permissions) {
10637 r = may_delete(in.get(), NULL, perms);
10638 if (r < 0)
10639 return r;
10640 }
10641 Inode *snapdir = open_snapdir(in.get());
10642 return _rmdir(snapdir, name, perms);
10643 }
10644
10645 // =============================
10646 // expose caps
10647
10648 int Client::get_caps_issued(int fd) {
10649
10650 std::lock_guard lock(client_lock);
10651
10652 if (unmounting)
10653 return -ENOTCONN;
10654
10655 Fh *f = get_filehandle(fd);
10656 if (!f)
10657 return -EBADF;
10658
10659 return f->inode->caps_issued();
10660 }
10661
10662 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10663 {
10664 std::lock_guard lock(client_lock);
10665
10666 if (unmounting)
10667 return -ENOTCONN;
10668
10669 filepath p(path);
10670 InodeRef in;
10671 int r = path_walk(p, &in, perms, true);
10672 if (r < 0)
10673 return r;
10674 return in->caps_issued();
10675 }
10676
10677 // =========================================
10678 // low level
10679
10680 Inode *Client::open_snapdir(Inode *diri)
10681 {
10682 Inode *in;
10683 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10684 if (!inode_map.count(vino)) {
10685 in = new Inode(this, vino, &diri->layout);
10686
10687 in->ino = diri->ino;
10688 in->snapid = CEPH_SNAPDIR;
10689 in->mode = diri->mode;
10690 in->uid = diri->uid;
10691 in->gid = diri->gid;
10692 in->nlink = 1;
10693 in->mtime = diri->mtime;
10694 in->ctime = diri->ctime;
10695 in->btime = diri->btime;
10696 in->size = diri->size;
10697 in->change_attr = diri->change_attr;
10698
10699 in->dirfragtree.clear();
10700 in->snapdir_parent = diri;
10701 diri->flags |= I_SNAPDIR_OPEN;
10702 inode_map[vino] = in;
10703 if (use_faked_inos())
10704 _assign_faked_ino(in);
10705 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10706 } else {
10707 in = inode_map[vino];
10708 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10709 }
10710 return in;
10711 }
10712
/**
 * Low-level lookup of `name` in directory `parent`.
 *
 * On success fills *attr from the resulting inode, takes an ll_ref on
 * it and returns it via *out.  On failure attr->st_ino is zeroed and
 * *out is set to NULL (the InodeRef is empty on the error path).
 */
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!fuse_default_permissions) {
    // "." and ".." are always permitted; anything else needs search
    // permission on the parent
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
	return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // caller owns one ll_ref on success

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10754
/**
 * Look up an inode by inode number and, when it has no dentry yet,
 * attach one (parent + name) so the inode is connected in the cache.
 *
 * On success the caller owns one ll-level reference on *inode; every
 * error path after the initial lookup drops that reference again via
 * _ll_forget().
 */
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  if (unmounting)
    return -ENOTCONN;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;

  ceph_assert(*inode != NULL);

  // already connected: nothing more to do
  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return 0;
  }

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r) {
    // undo the reference held on *inode before failing
    _ll_forget(*inode, 1);
    return r;
  }

  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // the parent was only needed for the name lookup; drop its ref
  _ll_forget(parent, 1);
  return 0;
}
10806
/**
 * Low-level lookup with statx-style attribute selection.
 *
 * On success fills *stx (per the want/flags-derived mask), takes an
 * ll_ref and returns the inode via *out; on failure stx_ino/stx_mask
 * are zeroed and *out is NULL (the InodeRef is empty).
 */
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!fuse_default_permissions) {
    // require search permission on the parent directory
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // caller owns one ll_ref on success
  }

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10847
10848 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10849 unsigned int want, unsigned int flags, const UserPerm& perms)
10850 {
10851 std::lock_guard lock(client_lock);
10852
10853 if (unmounting)
10854 return -ENOTCONN;
10855
10856 filepath fp(name, 0);
10857 InodeRef in;
10858 int rc;
10859 unsigned mask = statx_to_mask(flags, want);
10860
10861 ldout(cct, 3) << __func__ << " " << name << dendl;
10862 tout(cct) << __func__ << std::endl;
10863 tout(cct) << name << std::endl;
10864
10865 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10866 if (rc < 0) {
10867 /* zero out mask, just in case... */
10868 stx->stx_mask = 0;
10869 stx->stx_ino = 0;
10870 *out = NULL;
10871 return rc;
10872 } else {
10873 ceph_assert(in);
10874 fill_statx(in, mask, stx);
10875 _ll_get(in.get());
10876 *out = in.get();
10877 return 0;
10878 }
10879 }
10880
/**
 * Take a low-level (FUSE-style) reference on an inode.
 *
 * The transition from 0 to nonzero ll_ref also pins the in-memory
 * inode (in->get()), pins the parent dentry for directories (which
 * have exactly one parent — dirs can't be hard-linked), and bumps the
 * per-snapshot reference count for snapshot inodes.
 */
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10895
/**
 * Drop `num` low-level references from an inode.
 *
 * When the last ll_ref is released this mirrors _ll_get()'s pinning:
 * the parent dentry of a directory is unpinned, the per-snapshot
 * refcount is decremented (erasing the entry at zero), and the inode
 * pin is dropped via put_inode().
 *
 * @return 0 if this was the last ll_ref, otherwise the remaining count.
 */
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10918
/**
 * Drop every outstanding low-level reference on every inode.
 *
 * _ll_put() may end up removing the inode from inode_map (presumably
 * via put_inode() — confirm), so the iterator is advanced before each
 * call.  Each inode is also held in `to_be_put` so actual destruction
 * is deferred until the set is destroyed on function exit.
 */
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
10936
/**
 * Handle a FUSE-style forget of `count` references on an inode.
 *
 * Forgets are ignored while unmounting and on the root inode (ino 1).
 * A count larger than the held ll_ref is clamped, with a warning.
 *
 * @return true if the last reference was dropped (or the forget was
 *         ignored), false if references remain.
 */
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true; // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // over-forget: drop whatever we actually hold
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10965
// Public wrapper: take the client lock around _ll_forget().
bool Client::ll_forget(Inode *in, uint64_t count)
{
  std::lock_guard lock(client_lock);
  return _ll_forget(in, count);
}
10971
// Drop a single low-level reference; returns true if it was the last.
bool Client::ll_put(Inode *in)
{
  /* ll_forget already takes the lock */
  return ll_forget(in, 1);
}
10977
10978 int Client::ll_get_snap_ref(snapid_t snap)
10979 {
10980 std::lock_guard lock(client_lock);
10981 auto p = ll_snap_ref.find(snap);
10982 if (p != ll_snap_ref.end())
10983 return p->second;
10984 return 0;
10985 }
10986
// Return the inode's snap id (CEPH_NOSNAP for a live inode).
snapid_t Client::ll_get_snapid(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->snapid;
}
10992
10993 Inode *Client::ll_get_inode(ino_t ino)
10994 {
10995 std::lock_guard lock(client_lock);
10996
10997 if (unmounting)
10998 return NULL;
10999
11000 vinodeno_t vino = _map_faked_ino(ino);
11001 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11002 if (p == inode_map.end())
11003 return NULL;
11004 Inode *in = p->second;
11005 _ll_get(in);
11006 return in;
11007 }
11008
11009 Inode *Client::ll_get_inode(vinodeno_t vino)
11010 {
11011 std::lock_guard lock(client_lock);
11012
11013 if (unmounting)
11014 return NULL;
11015
11016 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11017 if (p == inode_map.end())
11018 return NULL;
11019 Inode *in = p->second;
11020 _ll_get(in);
11021 return in;
11022 }
11023
11024 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11025 {
11026 vinodeno_t vino = _get_vino(in);
11027
11028 ldout(cct, 8) << __func__ << " " << vino << dendl;
11029 tout(cct) << __func__ << std::endl;
11030 tout(cct) << vino.ino.val << std::endl;
11031
11032 if (vino.snapid < CEPH_NOSNAP)
11033 return 0;
11034 else
11035 return _getattr(in, caps, perms);
11036 }
11037
// Low-level getattr: refresh all inode attributes (snapshot inodes are
// served from cache, see _ll_getattr) and copy them into *attr.
int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);

  if (res == 0)
    fill_stat(in, attr);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
11052
11053 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11054 unsigned int flags, const UserPerm& perms)
11055 {
11056 std::lock_guard lock(client_lock);
11057
11058 if (unmounting)
11059 return -ENOTCONN;
11060
11061 int res = 0;
11062 unsigned mask = statx_to_mask(flags, want);
11063
11064 if (mask && !in->caps_issued_mask(mask, true))
11065 res = _ll_getattr(in, mask, perms);
11066
11067 if (res == 0)
11068 fill_statx(in, mask, stx);
11069 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11070 return res;
11071 }
11072
/**
 * Common implementation behind the ll_setattr*() entry points.
 *
 * Logs the request, enforces permissions unless FUSE is doing so
 * (fuse_default_permissions), strips the MTIME_NOW/ATIME_NOW
 * convenience bits before the low-level call, and hands off to
 * __setattrx().  On success *inp refers to the updated inode.
 */
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // the *_NOW bits are client-side conveniences; don't send them on
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
11101
/**
 * Low-level statx-style setattr.  On success *stx is refreshed from
 * the updated inode (using whatever caps are currently issued).
 */
int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // hold a ref across the op; the setattr must resolve to this inode
  InodeRef target(in);
  int res = _ll_setattrx(in, stx, mask, perms, &target);
  if (res == 0) {
    ceph_assert(in == target.get());
    fill_statx(in, in->caps_issued(), stx);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
11120
/**
 * Low-level setattr taking a struct stat: converts it to ceph_statx,
 * shares the _ll_setattrx() path, and refreshes *attr on success.
 */
int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
		       const UserPerm& perms)
{
  // conversion touches no shared state; done before taking the lock
  struct ceph_statx stx;
  stat_to_statx(attr, &stx);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // hold a ref across the op; the setattr must resolve to this inode
  InodeRef target(in);
  int res = _ll_setattrx(in, &stx, mask, perms, &target);
  if (res == 0) {
    ceph_assert(in == target.get());
    fill_stat(in, attr);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
11142
11143
11144 // ----------
11145 // xattrs
11146
11147 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11148 const UserPerm& perms)
11149 {
11150 std::lock_guard lock(client_lock);
11151
11152 if (unmounting)
11153 return -ENOTCONN;
11154
11155 InodeRef in;
11156 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11157 if (r < 0)
11158 return r;
11159 return _getxattr(in, name, value, size, perms);
11160 }
11161
11162 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11163 const UserPerm& perms)
11164 {
11165 std::lock_guard lock(client_lock);
11166
11167 if (unmounting)
11168 return -ENOTCONN;
11169
11170 InodeRef in;
11171 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11172 if (r < 0)
11173 return r;
11174 return _getxattr(in, name, value, size, perms);
11175 }
11176
11177 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11178 const UserPerm& perms)
11179 {
11180 std::lock_guard lock(client_lock);
11181
11182 if (unmounting)
11183 return -ENOTCONN;
11184
11185 Fh *f = get_filehandle(fd);
11186 if (!f)
11187 return -EBADF;
11188 return _getxattr(f->inode, name, value, size, perms);
11189 }
11190
11191 int Client::listxattr(const char *path, char *list, size_t size,
11192 const UserPerm& perms)
11193 {
11194 std::lock_guard lock(client_lock);
11195
11196 if (unmounting)
11197 return -ENOTCONN;
11198
11199 InodeRef in;
11200 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11201 if (r < 0)
11202 return r;
11203 return Client::_listxattr(in.get(), list, size, perms);
11204 }
11205
11206 int Client::llistxattr(const char *path, char *list, size_t size,
11207 const UserPerm& perms)
11208 {
11209 std::lock_guard lock(client_lock);
11210
11211 if (unmounting)
11212 return -ENOTCONN;
11213
11214 InodeRef in;
11215 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11216 if (r < 0)
11217 return r;
11218 return Client::_listxattr(in.get(), list, size, perms);
11219 }
11220
11221 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11222 {
11223 std::lock_guard lock(client_lock);
11224
11225 if (unmounting)
11226 return -ENOTCONN;
11227
11228 Fh *f = get_filehandle(fd);
11229 if (!f)
11230 return -EBADF;
11231 return Client::_listxattr(f->inode.get(), list, size, perms);
11232 }
11233
11234 int Client::removexattr(const char *path, const char *name,
11235 const UserPerm& perms)
11236 {
11237 std::lock_guard lock(client_lock);
11238
11239 if (unmounting)
11240 return -ENOTCONN;
11241
11242 InodeRef in;
11243 int r = Client::path_walk(path, &in, perms, true);
11244 if (r < 0)
11245 return r;
11246 return _removexattr(in, name, perms);
11247 }
11248
11249 int Client::lremovexattr(const char *path, const char *name,
11250 const UserPerm& perms)
11251 {
11252 std::lock_guard lock(client_lock);
11253
11254 if (unmounting)
11255 return -ENOTCONN;
11256
11257 InodeRef in;
11258 int r = Client::path_walk(path, &in, perms, false);
11259 if (r < 0)
11260 return r;
11261 return _removexattr(in, name, perms);
11262 }
11263
11264 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11265 {
11266 std::lock_guard lock(client_lock);
11267
11268 if (unmounting)
11269 return -ENOTCONN;
11270
11271 Fh *f = get_filehandle(fd);
11272 if (!f)
11273 return -EBADF;
11274 return _removexattr(f->inode, name, perms);
11275 }
11276
11277 int Client::setxattr(const char *path, const char *name, const void *value,
11278 size_t size, int flags, const UserPerm& perms)
11279 {
11280 _setxattr_maybe_wait_for_osdmap(name, value, size);
11281
11282 std::lock_guard lock(client_lock);
11283
11284 if (unmounting)
11285 return -ENOTCONN;
11286
11287 InodeRef in;
11288 int r = Client::path_walk(path, &in, perms, true);
11289 if (r < 0)
11290 return r;
11291 return _setxattr(in, name, value, size, flags, perms);
11292 }
11293
11294 int Client::lsetxattr(const char *path, const char *name, const void *value,
11295 size_t size, int flags, const UserPerm& perms)
11296 {
11297 _setxattr_maybe_wait_for_osdmap(name, value, size);
11298
11299 std::lock_guard lock(client_lock);
11300
11301 if (unmounting)
11302 return -ENOTCONN;
11303
11304 InodeRef in;
11305 int r = Client::path_walk(path, &in, perms, false);
11306 if (r < 0)
11307 return r;
11308 return _setxattr(in, name, value, size, flags, perms);
11309 }
11310
11311 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11312 int flags, const UserPerm& perms)
11313 {
11314 _setxattr_maybe_wait_for_osdmap(name, value, size);
11315
11316 std::lock_guard lock(client_lock);
11317
11318 if (unmounting)
11319 return -ENOTCONN;
11320
11321 Fh *f = get_filehandle(fd);
11322 if (!f)
11323 return -EBADF;
11324 return _setxattr(f->inode, name, value, size, flags, perms);
11325 }
11326
11327 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11328 const UserPerm& perms)
11329 {
11330 int r;
11331
11332 const VXattr *vxattr = _match_vxattr(in, name);
11333 if (vxattr) {
11334 r = -ENODATA;
11335
11336 // Do a force getattr to get the latest quota before returning
11337 // a value to userspace.
11338 int flags = 0;
11339 if (vxattr->flags & VXATTR_RSTAT) {
11340 flags |= CEPH_STAT_RSTAT;
11341 }
11342 r = _getattr(in, flags, perms, true);
11343 if (r != 0) {
11344 // Error from getattr!
11345 return r;
11346 }
11347
11348 // call pointer-to-member function
11349 char buf[256];
11350 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11351 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11352 } else {
11353 r = -ENODATA;
11354 }
11355
11356 if (size != 0) {
11357 if (r > (int)size) {
11358 r = -ERANGE;
11359 } else if (r > 0) {
11360 memcpy(value, buf, r);
11361 }
11362 }
11363 goto out;
11364 }
11365
11366 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11367 r = -EOPNOTSUPP;
11368 goto out;
11369 }
11370
11371 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11372 if (r == 0) {
11373 string n(name);
11374 r = -ENODATA;
11375 if (in->xattrs.count(n)) {
11376 r = in->xattrs[n].length();
11377 if (r > 0 && size != 0) {
11378 if (size >= (unsigned)r)
11379 memcpy(value, in->xattrs[n].c_str(), r);
11380 else
11381 r = -ERANGE;
11382 }
11383 }
11384 }
11385 out:
11386 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
11387 return r;
11388 }
11389
11390 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11391 const UserPerm& perms)
11392 {
11393 if (cct->_conf->client_permissions) {
11394 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11395 if (r < 0)
11396 return r;
11397 }
11398 return _getxattr(in.get(), name, value, size, perms);
11399 }
11400
/**
 * Low-level getxattr: permission-checked (unless FUSE enforces perms
 * itself) wrapper around _getxattr().
 */
int Client::ll_getxattr(Inode *in, const char *name, void *value,
			size_t size, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_READ, perms);
    if (r < 0)
      return r;
  }

  return _getxattr(in, name, value, size, perms);
}
11424
/**
 * Fill `name` with the inode's xattr names, each NUL-terminated.
 *
 * With size == 0 this is a length probe: returns the total number of
 * bytes required.  Otherwise returns the number of bytes written, or
 * -ERANGE when the buffer cannot hold all names.
 */
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // make sure our cached xattr map is current
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for (const auto& p : in->xattrs) {
    size_t this_len = p.first.length() + 1; // +1 for the trailing NUL
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    // copy name + NUL, then advance through the caller's buffer
    memcpy(name, p.first.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11454
// Low-level listxattr: logging wrapper around _listxattr().
int Client::ll_listxattr(Inode *in, char *names, size_t size,
			 const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  return _listxattr(in, names, size, perms);
}
11472
11473 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11474 size_t size, int flags, const UserPerm& perms)
11475 {
11476
11477 int xattr_flags = 0;
11478 if (!value)
11479 xattr_flags |= CEPH_XATTR_REMOVE;
11480 if (flags & XATTR_CREATE)
11481 xattr_flags |= CEPH_XATTR_CREATE;
11482 if (flags & XATTR_REPLACE)
11483 xattr_flags |= CEPH_XATTR_REPLACE;
11484
11485 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11486 filepath path;
11487 in->make_nosnap_relative_path(path);
11488 req->set_filepath(path);
11489 req->set_string2(name);
11490 req->set_inode(in);
11491 req->head.args.setxattr.flags = xattr_flags;
11492
11493 bufferlist bl;
11494 assert (value || size == 0);
11495 bl.append((const char*)value, size);
11496 req->set_data(bl);
11497
11498 int res = make_request(req, perms);
11499
11500 trim_cache();
11501 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
11502 res << dendl;
11503 return res;
11504 }
11505
/**
 * Validate and apply a setxattr on `in`.
 *
 * - Snapshots are read-only (-EROFS).
 * - Only the user./security./trusted./ceph. namespaces are accepted,
 *   plus system.* when POSIX ACLs are enabled (-EOPNOTSUPP otherwise).
 * - POSIX ACL xattrs are canonicalized: an access ACL equivalent to
 *   plain mode bits is applied as a mode change and the xattr dropped;
 *   default ACLs are only valid on directories.
 * - After setting a ceph.quota* value, verify that a snaprealm rooted
 *   at this inode now exists; if not, report -EOPNOTSUPP.
 */
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  // ACL is fully representable as mode bits: drop the xattr
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  // empty default ACL: store a removal instead
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -EOPNOTSUPP;
      // quota writes need a post-check that the snaprealm appeared
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
11580
11581 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11582 size_t size, int flags, const UserPerm& perms)
11583 {
11584 if (cct->_conf->client_permissions) {
11585 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11586 if (r < 0)
11587 return r;
11588 }
11589 return _setxattr(in.get(), name, value, size, flags, perms);
11590 }
11591
/**
 * For a "layout" or "layout.pool" value about to be set, extract the
 * named pool (by id or name) and verify it exists in *osdmap.
 *
 * @return 0 when no pool was named or it exists; -EINVAL when a full
 *         layout string does not parse; -ENOENT when the pool is
 *         unknown to this osdmap.
 */
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    // full layout string: parse "key=value ..." and pull out "pool"
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    // a numeric value is a pool id; anything else is a pool name
    int64_t pool;
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11631
11632 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11633 {
11634 // For setting pool of layout, MetaRequest need osdmap epoch.
11635 // There is a race which create a new data pool but client and mds both don't have.
11636 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11637 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11638 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11639 string rest(strstr(name, "layout"));
11640 string v((const char*)value, size);
11641 int r = objecter->with_osdmap([&](const OSDMap& o) {
11642 return _setxattr_check_data_pool(rest, v, &o);
11643 });
11644
11645 if (r == -ENOENT) {
11646 C_SaferCond ctx;
11647 objecter->wait_for_latest_osdmap(&ctx);
11648 ctx.wait();
11649 }
11650 }
11651 }
11652
/**
 * Low-level setxattr: permission-checked (unless FUSE enforces perms
 * itself) wrapper around _setxattr().  The osdmap wait for layout-pool
 * values happens before client_lock is taken.
 */
int Client::ll_setxattr(Inode *in, const char *name, const void *value,
			size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in, name, value, size, flags, perms);
}
11677
/**
 * Remove an xattr via CEPH_MDS_OP_RMXATTR.
 *
 * Snapshots are read-only (-EROFS); only the same namespaces the
 * kernel client accepts are allowed, and read-only virtual xattrs
 * cannot be removed (-EOPNOTSUPP).
 */
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
11709
11710 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11711 {
11712 if (cct->_conf->client_permissions) {
11713 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11714 if (r < 0)
11715 return r;
11716 }
11717 return _removexattr(in.get(), name, perms);
11718 }
11719
// Low-level interface: remove extended attribute `name` from *in.
int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
  tout(cct) << "ll_removexattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // When FUSE's default_permissions is off, do the xattr write permission
  // check ourselves before removing.
  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }

  return _removexattr(in, name, perms);
}
11742
11743 bool Client::_vxattrcb_quota_exists(Inode *in)
11744 {
11745 return in->quota.is_enable() &&
11746 in->snaprealm && in->snaprealm->ino == in->ino;
11747 }
11748 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11749 {
11750 return snprintf(val, size,
11751 "max_bytes=%lld max_files=%lld",
11752 (long long int)in->quota.max_bytes,
11753 (long long int)in->quota.max_files);
11754 }
11755 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11756 {
11757 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11758 }
11759 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11760 {
11761 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11762 }
11763
11764 bool Client::_vxattrcb_layout_exists(Inode *in)
11765 {
11766 return in->layout != file_layout_t();
11767 }
11768 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11769 {
11770 int r = snprintf(val, size,
11771 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11772 (unsigned long long)in->layout.stripe_unit,
11773 (unsigned long long)in->layout.stripe_count,
11774 (unsigned long long)in->layout.object_size);
11775 objecter->with_osdmap([&](const OSDMap& o) {
11776 if (o.have_pg_pool(in->layout.pool_id))
11777 r += snprintf(val + r, size - r, "%s",
11778 o.get_pool_name(in->layout.pool_id).c_str());
11779 else
11780 r += snprintf(val + r, size - r, "%" PRIu64,
11781 (uint64_t)in->layout.pool_id);
11782 });
11783 if (in->layout.pool_ns.length())
11784 r += snprintf(val + r, size - r, " pool_namespace=%s",
11785 in->layout.pool_ns.c_str());
11786 return r;
11787 }
11788 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11789 {
11790 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
11791 }
11792 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11793 {
11794 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
11795 }
11796 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11797 {
11798 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
11799 }
11800 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11801 {
11802 size_t r;
11803 objecter->with_osdmap([&](const OSDMap& o) {
11804 if (o.have_pg_pool(in->layout.pool_id))
11805 r = snprintf(val, size, "%s", o.get_pool_name(
11806 in->layout.pool_id).c_str());
11807 else
11808 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11809 });
11810 return r;
11811 }
11812 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11813 {
11814 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11815 }
11816 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11817 {
11818 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11819 }
11820 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11821 {
11822 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
11823 }
11824 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11825 {
11826 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
11827 }
11828 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11829 {
11830 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11831 }
11832 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11833 {
11834 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
11835 }
11836 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11837 {
11838 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
11839 }
11840 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11841 {
11842 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
11843 }
11844 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11845 {
11846 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
11847 (long)in->rstat.rctime.nsec());
11848 }
11849 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11850 {
11851 return in->dir_pin != -ENODATA;
11852 }
11853 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11854 {
11855 return snprintf(val, size, "%ld", (long)in->dir_pin);
11856 }
11857
11858 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
11859 {
11860 return !in->snap_btime.is_zero();
11861 }
11862
11863 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
11864 {
11865 return snprintf(val, size, "%llu.%09lu",
11866 (long long unsigned)in->snap_btime.sec(),
11867 (long unsigned)in->snap_btime.nsec());
11868 }
11869
// Helpers for composing "ceph.<type>.<name>" vxattr names and table
// entries.  The entry macros use GNU designated initializers (name:, ...)
// matching the VXattr struct layout.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only vxattr backed by _vxattrcb_<type>_<name>; always present
// (no exists_cb), no flags.
#define XATTR_NAME_CEPH(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  exists_cb: NULL,						\
  flags: 0,							\
}
// Same as XATTR_NAME_CEPH but with explicit flags (e.g. VXATTR_RSTAT).
#define XATTR_NAME_CEPH2(_type, _name, _flags)			\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  exists_cb: NULL,						\
  flags: _flags,						\
}
// Writable per-field layout vxattr; only shown when the inode has a
// non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		\
{								\
  name: CEPH_XATTR_NAME2(_type, _name, _field),			\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	\
  readonly: false,						\
  exists_cb: &Client::_vxattrcb_layout_exists,			\
  flags: 0,							\
}
// Writable quota vxattr; only shown when a quota is set on the inode.
#define XATTR_QUOTA_FIELD(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: false,						\
  exists_cb: &Client::_vxattrcb_quota_exists,			\
  flags: 0,							\
}
11905
// Virtual xattr table for directories.  Scanned linearly by
// _match_vxattr(); terminated by the empty-name entry.
const Client::VXattr Client::_dir_vxattrs[] = {
  // Whole layout in one string; per-field entries follow.
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  // Direct-child stats.
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  // Recursive stats; VXATTR_RSTAT marks them as rstat-derived.
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  // Quota: combined entry plus per-limit entries.
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // MDS export pin.
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  // Snapshot birth time (read-only).
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
11952
// Virtual xattr table for regular files.  Same layout entries as the
// directory table, but no dir stats, quota, or pin entries.  Terminated
// by the empty-name entry.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  // Snapshot birth time (read-only).
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
11975
11976 const Client::VXattr *Client::_get_vxattrs(Inode *in)
11977 {
11978 if (in->is_dir())
11979 return _dir_vxattrs;
11980 else if (in->is_file())
11981 return _file_vxattrs;
11982 return NULL;
11983 }
11984
11985 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11986 {
11987 if (strncmp(name, "ceph.", 5) == 0) {
11988 const VXattr *vxattr = _get_vxattrs(in);
11989 if (vxattr) {
11990 while (!vxattr->name.empty()) {
11991 if (vxattr->name == name)
11992 return vxattr;
11993 vxattr++;
11994 }
11995 }
11996 }
11997 return NULL;
11998 }
11999
// Low-level interface: read the target of symlink *in into buf.
// Returns the number of bytes placed in buf, or a negative errno.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // Keep every dentry pointing at this inode warm in the LRU.
  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
12021
// Create a device/special file `name` under *dir via an MDS MKNOD request.
// On success *inp refers to the new inode.  Returns 0 or a negative errno.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // No creation inside snapshots or when the file-count quota is full.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Default ACLs from the parent may adjust `mode` and supply initial
  // xattrs, shipped as request data.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // make_request() was never reached; release the request ourselves.
  put_request(req);
  return res;
}
12075
// Low-level mknod: create a special file and return its inode (with an
// ll ref taken) and stat in *attr.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());   // caller owns an ll reference on success
  }
  // NOTE(review): on failure attr is not filled, yet st_ino is traced
  // below — the caller-provided struct may be uninitialized; confirm
  // callers zero it (compare ll_mkdirx, which clears stx on error).
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	        << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12112
// Low-level mknod, statx flavor: like ll_mknod but fills a ceph_statx
// with the caps implied by want/flags.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());   // caller owns an ll reference on success
  }
  // NOTE(review): stx is not cleared on failure here (ll_mkdirx does);
  // stx_ino below may be uninitialized on the error path — confirm.
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	        << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12151
// Create (and optionally open) file `name` under *dir via an MDS CREATE
// request.  Optional layout parameters (stripe_*, object_size, data_pool)
// are passed through to the MDS.  If fhp is non-null, an open file handle
// for the new inode is returned through it.  *created reports whether the
// MDS actually created the file (vs. O_CREAT on an existing one).
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // Resolve the requested data pool name to an id; the wire field only
  // holds 32 bits, hence the explicit range check.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // Default ACLs may adjust `mode` and provide initial xattrs.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

  // Note: the success path falls through reply_error too; it only does
  // the shared cleanup/logging, no error handling.
 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // make_request() was never reached; release the request ourselves.
  put_request(req);
  return res;
}
12246
12247
// Create directory `name` under *dir.  Inside the magic .snap directory
// (CEPH_SNAPDIR) this becomes a MKSNAP instead of a MKDIR.  On success
// *inp refers to the new inode.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only, except that mkdir in .snap creates one.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // Default ACLs may adjust `mode` and provide initial xattrs.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // make_request() was never reached; release the request ourselves.
  put_request(req);
  return res;
}
12303
// Low-level mkdir: create a directory and return its inode (with an ll
// ref taken) and stat in *attr.
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());   // caller owns an ll reference on success
  }
  // NOTE(review): attr is not filled on failure, but st_ino is traced
  // below regardless — may read an uninitialized caller struct (compare
  // ll_mkdirx, which zeroes stx on error).
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
	        << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12338
// Low-level mkdir, statx flavor.  Unlike ll_mkdir, this clears stx on
// failure so the traced stx_ino is always defined.
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());   // caller owns an ll reference on success
  } else {
    // Make the failure visible in stx rather than leaving stale fields.
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
	        << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12377
// Create symlink `name` -> `target` under *dir via an MDS SYMLINK
// request.  On success *inp refers to the new inode.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);   // link target rides in the string2 field
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // make_request() was never reached; release the request ourselves.
  put_request(req);
  return res;
}
12423
// Low-level symlink: create `name` -> `value` and return the new inode
// (with an ll ref taken) and stat in *attr.
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());   // caller owns an ll reference on success
  }
  // NOTE(review): attr->st_ino is traced even on failure, when attr was
  // not filled — may read an uninitialized caller struct; confirm.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
	        << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12459
// Low-level symlink, statx flavor: like ll_symlink but fills ceph_statx.
int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
			Inode **out, struct ceph_statx *stx, unsigned want,
			unsigned flags, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());   // caller owns an ll reference on success
  }
  // NOTE(review): stx is not cleared on failure (ll_mkdirx does); the
  // traced stx_ino may be uninitialized on the error path — confirm.
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
	        << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12496
// Unlink `name` from *dir via an MDS UNLINK request.  Looks up the target
// inode first so its delegations can be broken and its caps dropped.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Resolve the victim inode; needed below for deleg/caps handling.
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();   // recall any delegations before the unlink
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // make_request() was never reached; release the request ourselves.
  put_request(req);
  return res;
}
12546
12547 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12548 {
12549 std::lock_guard lock(client_lock);
12550
12551 if (unmounting)
12552 return -ENOTCONN;
12553
12554 vinodeno_t vino = _get_vino(in);
12555
12556 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12557 tout(cct) << "ll_unlink" << std::endl;
12558 tout(cct) << vino.ino.val << std::endl;
12559 tout(cct) << name << std::endl;
12560
12561 if (!fuse_default_permissions) {
12562 int r = may_delete(in, name, perm);
12563 if (r < 0)
12564 return r;
12565 }
12566 return _unlink(in, name, perm);
12567 }
12568
12569 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12570 {
12571 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
12572 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12573
12574 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12575 return -EROFS;
12576 }
12577
12578 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12579 MetaRequest *req = new MetaRequest(op);
12580 filepath path;
12581 dir->make_nosnap_relative_path(path);
12582 path.push_dentry(name);
12583 req->set_filepath(path);
12584 req->set_inode(dir);
12585
12586 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12587 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12588 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12589
12590 InodeRef in;
12591
12592 Dentry *de;
12593 int res = get_or_create(dir, name, &de);
12594 if (res < 0)
12595 goto fail;
12596 if (op == CEPH_MDS_OP_RMDIR)
12597 req->set_dentry(de);
12598 else
12599 de->get();
12600
12601 res = _lookup(dir, name, 0, &in, perms);
12602 if (res < 0)
12603 goto fail;
12604
12605 if (op == CEPH_MDS_OP_RMSNAP) {
12606 unlink(de, true, true);
12607 de->put();
12608 }
12609 req->set_other_inode(in.get());
12610
12611 res = make_request(req, perms);
12612
12613 trim_cache();
12614 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
12615 return res;
12616
12617 fail:
12618 put_request(req);
12619 return res;
12620 }
12621
// Low-level rmdir: remove directory `name` from *in, with an optional
// client-side delete-permission check.
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // When FUSE's default_permissions is off, check delete permission here.
  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }

  return _rmdir(in, name, perms);
}
12644
// Rename fromdir/fromname -> todir/toname via an MDS RENAME request (or
// RENAMESNAP when renaming a snapshot inside the same .snap dir).
// Performs client-side quota pre-checks when the rename crosses quota
// roots.  Returns 0 or a negative errno.
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  // Cannot rename across snapshots.
  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // Only snapshot renames within the same .snap dir are allowed.
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    Inode *fromdir_root = nullptr;
    Inode *todir_root = nullptr;
    int mask = 0;
    bool quota_check = false;
    // When moving between directories with different quota roots, the
    // destination quota must be checked against fresh rstats.
    if (fromdir != todir) {
      fromdir_root =
	fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
      todir_root =
	todir->quota.is_enable() ? todir : get_quota_root(todir, perm);

      if (todir_root->quota.is_enable() && fromdir_root != todir_root) {
	// use CEPH_STAT_RSTAT mask to force send getattr or lookup request
	// to auth MDS to get latest rstat for todir_root and source dir
	// even if their dentry caches and inode caps are satisfied.
	res = _getattr(todir_root, CEPH_STAT_RSTAT, perm, true);
	if (res < 0)
	  goto fail;

	quota_check = true;
	if (oldde->inode && oldde->inode->is_dir()) {
	  mask |= CEPH_STAT_RSTAT;
	}
      }
    }

    res = _lookup(fromdir, fromname, mask, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();   // recall delegations before rename
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    if (quota_check) {
      // Estimate how much the move adds to the destination quota root:
      // a directory contributes its recursive totals, a file its size.
      int64_t old_bytes, old_files;
      if (oldinode->is_dir()) {
	old_bytes = oldinode->rstat.rbytes;
	old_files = oldinode->rstat.rsize();
      } else {
	old_bytes = oldinode->size;
	old_files = 1;
      }

      bool quota_exceed = false;
      if (todir_root && todir_root->quota.max_bytes &&
	  (old_bytes + todir_root->rstat.rbytes) >= todir_root->quota.max_bytes) {
	ldout(cct, 10) << "_rename (" << oldinode->ino << " bytes="
		       << old_bytes << ") to (" << todir->ino
		       << ") will exceed quota on " << *todir_root << dendl;
	quota_exceed = true;
      }

      if (todir_root && todir_root->quota.max_files &&
	  (old_files + todir_root->rstat.rsize()) >= todir_root->quota.max_files) {
	ldout(cct, 10) << "_rename (" << oldinode->ino << " files="
		       << old_files << ") to (" << todir->ino
		       << ") will exceed quota on " << *todir_root << dendl;
	quota_exceed = true;
      }

      if (quota_exceed) {
	// Directories get EXDEV (caller may fall back to copy+delete);
	// files get EDQUOT directly.
	res = (oldinode->is_dir()) ? -EXDEV : -EDQUOT;
	goto fail;
      }
    }

    // If the destination name already exists, its inode will be
    // unlinked by the rename; break its delegations and drop link caps.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  // make_request() was never reached; release the request ourselves.
  put_request(req);
  return res;
}
12800
12801 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12802 const char *newname, const UserPerm& perm)
12803 {
12804 std::lock_guard lock(client_lock);
12805
12806 if (unmounting)
12807 return -ENOTCONN;
12808
12809 vinodeno_t vparent = _get_vino(parent);
12810 vinodeno_t vnewparent = _get_vino(newparent);
12811
12812 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12813 << vnewparent << " " << newname << dendl;
12814 tout(cct) << "ll_rename" << std::endl;
12815 tout(cct) << vparent.ino.val << std::endl;
12816 tout(cct) << name << std::endl;
12817 tout(cct) << vnewparent.ino.val << std::endl;
12818 tout(cct) << newname << std::endl;
12819
12820 if (!fuse_default_permissions) {
12821 int r = may_delete(parent, name, perm);
12822 if (r < 0)
12823 return r;
12824 r = may_delete(newparent, newname, perm);
12825 if (r < 0 && r != -ENOENT)
12826 return r;
12827 }
12828
12829 return _rename(parent, name, newparent, newname, perm);
12830 }
12831
/**
 * Create a hard link to inode `in` named `newname` inside directory `dir`.
 *
 * Returns 0 on success or a negative errno:
 *   -ENAMETOOLONG  name longer than NAME_MAX
 *   -EROFS         either inode is a snapshot (read-only)
 *   -EDQUOT        file-count quota on `dir` would be exceeded
 * On success *inp (if provided by the caller) receives the linked inode.
 */
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots are immutable: refuse to link from or into one
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // recall any delegations before the MDS sees the link
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  // drop shared dir caps unless we hold exclusive ones
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // make_request consumes the request on the success path; on early
  // failure we must drop our ref ourselves
  put_request(req);
  return res;
}
12876
12877 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12878 const UserPerm& perm)
12879 {
12880 std::lock_guard lock(client_lock);
12881
12882 if (unmounting)
12883 return -ENOTCONN;
12884
12885 vinodeno_t vino = _get_vino(in);
12886 vinodeno_t vnewparent = _get_vino(newparent);
12887
12888 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12889 newname << dendl;
12890 tout(cct) << "ll_link" << std::endl;
12891 tout(cct) << vino.ino.val << std::endl;
12892 tout(cct) << vnewparent << std::endl;
12893 tout(cct) << newname << std::endl;
12894
12895 InodeRef target;
12896
12897 if (!fuse_default_permissions) {
12898 if (S_ISDIR(in->mode))
12899 return -EPERM;
12900
12901 int r = may_hardlink(in, perm);
12902 if (r < 0)
12903 return r;
12904
12905 r = may_create(newparent, perm);
12906 if (r < 0)
12907 return r;
12908 }
12909
12910 return _link(in, newparent, newname, perm, &target);
12911 }
12912
12913 int Client::ll_num_osds(void)
12914 {
12915 std::lock_guard lock(client_lock);
12916 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12917 }
12918
12919 int Client::ll_osdaddr(int osd, uint32_t *addr)
12920 {
12921 std::lock_guard lock(client_lock);
12922
12923 entity_addr_t g;
12924 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12925 if (!o.exists(osd))
12926 return false;
12927 g = o.get_addrs(osd).front();
12928 return true;
12929 });
12930 if (!exists)
12931 return -1;
12932 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12933 *addr = ntohl(nb_addr);
12934 return 0;
12935 }
12936
12937 uint32_t Client::ll_stripe_unit(Inode *in)
12938 {
12939 std::lock_guard lock(client_lock);
12940 return in->layout.stripe_unit;
12941 }
12942
// Return the snapshot sequence number of the inode's snap realm.
// NOTE(review): dereferences in->snaprealm unconditionally — assumes the
// inode is always attached to a realm here; confirm callers guarantee it.
uint64_t Client::ll_snap_seq(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->snaprealm->seq;
}
12948
12949 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12950 {
12951 std::lock_guard lock(client_lock);
12952 *layout = in->layout;
12953 return 0;
12954 }
12955
12956 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12957 {
12958 return ll_file_layout(fh->inode.get(), layout);
12959 }
12960
12961 /* Currently we cannot take advantage of redundancy in reads, since we
12962 would have to go through all possible placement groups (a
12963 potentially quite large number determined by a hash), and use CRUSH
12964 to calculate the appropriate set of OSDs for each placement group,
12965 then index into that. An array with one entry per OSD is much more
12966 tractable and works for demonstration purposes. */
12967
12968 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12969 file_layout_t* layout)
12970 {
12971 std::lock_guard lock(client_lock);
12972
12973 inodeno_t ino = in->ino;
12974 uint32_t object_size = layout->object_size;
12975 uint32_t su = layout->stripe_unit;
12976 uint32_t stripe_count = layout->stripe_count;
12977 uint64_t stripes_per_object = object_size / su;
12978 uint64_t stripeno = 0, stripepos = 0;
12979
12980 if(stripe_count) {
12981 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12982 stripepos = blockno % stripe_count; // which object in the object set (X)
12983 }
12984 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12985 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12986
12987 object_t oid = file_object_t(ino, objectno);
12988 return objecter->with_osdmap([&](const OSDMap& o) {
12989 ceph_object_layout olayout =
12990 o.file_to_object_layout(oid, *layout);
12991 pg_t pg = (pg_t)olayout.ol_pgid;
12992 vector<int> osds;
12993 int primary;
12994 o.pg_to_acting_osds(pg, &osds, &primary);
12995 return primary;
12996 });
12997 }
12998
12999 /* Return the offset of the block, internal to the object */
13000
/* Return the offset of the block, internal to the object */
// i.e. where block `blockno` starts inside its RADOS object:
// (block index within the object) * stripe_unit.
// NOTE(review): divides by stripes_per_object, which is zero when
// stripe_unit == 0 or object_size < stripe_unit — assumes the inode's
// layout is already validated; confirm upstream guarantees this.
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  std::lock_guard lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
13011
13012 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13013 const UserPerm& perms)
13014 {
13015 std::lock_guard lock(client_lock);
13016
13017 if (unmounting)
13018 return -ENOTCONN;
13019
13020 vinodeno_t vino = _get_vino(in);
13021
13022 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13023 tout(cct) << "ll_opendir" << std::endl;
13024 tout(cct) << vino.ino.val << std::endl;
13025
13026 if (!fuse_default_permissions) {
13027 int r = may_open(in, flags, perms);
13028 if (r < 0)
13029 return r;
13030 }
13031
13032 int r = _opendir(in, dirpp, perms);
13033 tout(cct) << (unsigned long)*dirpp << std::endl;
13034
13035 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13036 << dendl;
13037 return r;
13038 }
13039
13040 int Client::ll_releasedir(dir_result_t *dirp)
13041 {
13042 std::lock_guard lock(client_lock);
13043 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13044 tout(cct) << "ll_releasedir" << std::endl;
13045 tout(cct) << (unsigned long)dirp << std::endl;
13046
13047 if (unmounting)
13048 return -ENOTCONN;
13049
13050 _closedir(dirp);
13051 return 0;
13052 }
13053
13054 int Client::ll_fsyncdir(dir_result_t *dirp)
13055 {
13056 std::lock_guard lock(client_lock);
13057 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13058 tout(cct) << "ll_fsyncdir" << std::endl;
13059 tout(cct) << (unsigned long)dirp << std::endl;
13060
13061 if (unmounting)
13062 return -ENOTCONN;
13063
13064 return _fsync(dirp->inode.get(), false);
13065 }
13066
/**
 * Low-level open of an existing inode.  O_CREAT is not supported here
 * (use ll_create); asserted below.
 *
 * On success *fhp (if non-NULL) receives the new handle, which is also
 * tracked in ll_unclosed_fh_set for leak detection at unmount.
 */
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // NOTE(review): if may_open failed, *fhp was never written by _open —
  // assumes callers pass fhp pointing at a null-initialized slot; confirm.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
      " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
13102
/**
 * Shared implementation for ll_create/ll_createx: look up `name` under
 * `parent`, creating it if O_CREAT is set and it is absent, then open it.
 *
 * @param caps getattr mask used for the initial lookup
 * @param fhp  out: open file handle (always written; NULL on failure)
 * @return 0 on success; -EEXIST for O_CREAT|O_EXCL on an existing name;
 *         other negative errno from lookup/create/open.
 */
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // name exists and caller demanded exclusive creation
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create both creates and opens: on success *fhp may already be set
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // pre-existing file: re-check open permission and open it ourselves
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

 out:
  // track every handle we hand out so unmount can detect leaks
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13184
13185 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13186 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13187 const UserPerm& perms)
13188 {
13189 std::lock_guard lock(client_lock);
13190 InodeRef in;
13191
13192 if (unmounting)
13193 return -ENOTCONN;
13194
13195 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13196 fhp, perms);
13197 if (r >= 0) {
13198 ceph_assert(in);
13199
13200 // passing an Inode in outp requires an additional ref
13201 if (outp) {
13202 _ll_get(in.get());
13203 *outp = in.get();
13204 }
13205 fill_stat(in, attr);
13206 } else {
13207 attr->st_ino = 0;
13208 }
13209
13210 return r;
13211 }
13212
13213 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13214 int oflags, Inode **outp, Fh **fhp,
13215 struct ceph_statx *stx, unsigned want, unsigned lflags,
13216 const UserPerm& perms)
13217 {
13218 unsigned caps = statx_to_mask(lflags, want);
13219 std::lock_guard lock(client_lock);
13220 InodeRef in;
13221
13222 if (unmounting)
13223 return -ENOTCONN;
13224
13225 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13226 if (r >= 0) {
13227 ceph_assert(in);
13228
13229 // passing an Inode in outp requires an additional ref
13230 if (outp) {
13231 _ll_get(in.get());
13232 *outp = in.get();
13233 }
13234 fill_statx(in, caps, stx);
13235 } else {
13236 stx->stx_ino = 0;
13237 stx->stx_mask = 0;
13238 }
13239
13240 return r;
13241 }
13242
13243 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13244 {
13245 std::lock_guard lock(client_lock);
13246 tout(cct) << "ll_lseek" << std::endl;
13247 tout(cct) << offset << std::endl;
13248 tout(cct) << whence << std::endl;
13249
13250 if (unmounting)
13251 return -ENOTCONN;
13252
13253 return _lseek(fh, offset, whence);
13254 }
13255
13256 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13257 {
13258 std::lock_guard lock(client_lock);
13259 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
13260 tout(cct) << "ll_read" << std::endl;
13261 tout(cct) << (unsigned long)fh << std::endl;
13262 tout(cct) << off << std::endl;
13263 tout(cct) << len << std::endl;
13264
13265 if (unmounting)
13266 return -ENOTCONN;
13267
13268 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13269 len = std::min(len, (loff_t)INT_MAX);
13270 return _read(fh, off, len, bl);
13271 }
13272
/**
 * Read a single RADOS object ("block") of `in` directly via the objecter,
 * bypassing the file cache.
 *
 * @param blockid object index within the file
 * @param buf     destination buffer, must hold at least `length` bytes
 * @return bytes read on success, negative errno on failure
 */
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // drop client_lock while blocking on the OSD reply so other client
  // operations can make progress
  client_lock.unlock();
  int r = onfinish.wait();
  client_lock.lock();

  if (r >= 0) {
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }

  return r;
}
13309
13310 /* It appears that the OSD doesn't return success unless the entire
13311 buffer was written, return the write length on success. */
13312
13313 int Client::ll_write_block(Inode *in, uint64_t blockid,
13314 char* buf, uint64_t offset,
13315 uint64_t length, file_layout_t* layout,
13316 uint64_t snapseq, uint32_t sync)
13317 {
13318 vinodeno_t vino = ll_get_vino(in);
13319 int r = 0;
13320 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13321
13322 if (length == 0) {
13323 return -EINVAL;
13324 }
13325 if (true || sync) {
13326 /* if write is stable, the epilogue is waiting on
13327 * flock */
13328 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
13329 }
13330 object_t oid = file_object_t(vino.ino, blockid);
13331 SnapContext fakesnap;
13332 ceph::bufferlist bl;
13333 if (length > 0) {
13334 bl.push_back(buffer::copy(buf, length));
13335 }
13336
13337 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
13338 << dendl;
13339
13340 fakesnap.seq = snapseq;
13341
13342 /* lock just in time */
13343 client_lock.lock();
13344 if (unmounting) {
13345 client_lock.unlock();
13346 return -ENOTCONN;
13347 }
13348
13349 objecter->write(oid,
13350 object_locator_t(layout->pool_id),
13351 offset,
13352 length,
13353 fakesnap,
13354 bl,
13355 ceph::real_clock::now(),
13356 0,
13357 onsafe.get());
13358
13359 client_lock.unlock();
13360 if (nullptr != onsafe) {
13361 r = onsafe->wait();
13362 }
13363
13364 if (r < 0) {
13365 return r;
13366 } else {
13367 return length;
13368 }
13369 }
13370
/**
 * Commit previously written blocks of `in` in [offset, offset+length).
 *
 * Currently a no-op that always returns 0: the barrier-based
 * implementation below is disabled (commented out) in this build.
 */
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
13396
13397 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13398 {
13399 std::lock_guard lock(client_lock);
13400 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13401 "~" << len << dendl;
13402 tout(cct) << "ll_write" << std::endl;
13403 tout(cct) << (unsigned long)fh << std::endl;
13404 tout(cct) << off << std::endl;
13405 tout(cct) << len << std::endl;
13406
13407 if (unmounting)
13408 return -ENOTCONN;
13409
13410 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13411 len = std::min(len, (loff_t)INT_MAX);
13412 int r = _write(fh, off, len, data, NULL, 0);
13413 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13414 << dendl;
13415 return r;
13416 }
13417
13418 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13419 {
13420 std::lock_guard lock(client_lock);
13421 if (unmounting)
13422 return -ENOTCONN;
13423 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13424 }
13425
13426 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13427 {
13428 std::lock_guard lock(client_lock);
13429 if (unmounting)
13430 return -ENOTCONN;
13431 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13432 }
13433
13434 int Client::ll_flush(Fh *fh)
13435 {
13436 std::lock_guard lock(client_lock);
13437 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13438 tout(cct) << "ll_flush" << std::endl;
13439 tout(cct) << (unsigned long)fh << std::endl;
13440
13441 if (unmounting)
13442 return -ENOTCONN;
13443
13444 return _flush(fh);
13445 }
13446
13447 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13448 {
13449 std::lock_guard lock(client_lock);
13450 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13451 tout(cct) << "ll_fsync" << std::endl;
13452 tout(cct) << (unsigned long)fh << std::endl;
13453
13454 if (unmounting)
13455 return -ENOTCONN;
13456
13457 int r = _fsync(fh, syncdataonly);
13458 if (r) {
13459 // If we're returning an error, clear it from the FH
13460 fh->take_async_err();
13461 }
13462 return r;
13463 }
13464
13465 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13466 {
13467 std::lock_guard lock(client_lock);
13468 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13469 tout(cct) << "ll_sync_inode" << std::endl;
13470 tout(cct) << (unsigned long)in << std::endl;
13471
13472 if (unmounting)
13473 return -ENOTCONN;
13474
13475 return _fsync(in, syncdataonly);
13476 }
13477
13478 #ifdef FALLOC_FL_PUNCH_HOLE
13479
/**
 * Implement fallocate(2) semantics on an open handle.
 *
 * Supported modes: FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE (punch
 * requires KEEP_SIZE, matching Linux).  Plain allocation only extends
 * i_size; punching zeroes the range via the OSDs (or edits inline data
 * in place when we hold buffer caps).
 *
 * @return 0 on success; -EINVAL/-EOPNOTSUPP/-ENOSPC/-EROFS/-EBADF/-EDQUOT
 *         for the respective precondition failures, or an errno from
 *         get_caps / the uninline wait.
 */
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // as on Linux, punching a hole must not change the file size
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // a full pool can still accept hole-punching (it frees space)
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // quota check only applies when we would actually grow the file
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // data is still inline and we hold buffer caps: splice zeros into
      // the inline blob locally instead of going to the OSDs
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          inline_iter.copy(offset, bl);          // keep prefix
        int size = length;
        if (offset + size > len)
          size = len - offset;                   // clamp hole to inline extent
        if (size > 0)
          bl.append_zero(size);                  // the hole itself
        if (offset + size < len) {
          inline_iter += size;
          inline_iter.copy(len - offset - size, bl);  // keep suffix
        }
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // push inline data out to the OSDs first, if any
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      // invalidate cached pages over the hole, then zero it on the OSDs
      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // drop client_lock while waiting on the OSD zero op
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // plain allocate: just extend i_size; no data needs writing
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  // reap the uninline operation started above, if any
  if (nullptr != onuninline) {
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    // ECANCELED means someone else already uninlined it — still success
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13600 #else
13601
// Stub for platforms without FALLOC_FL_PUNCH_HOLE: fallocate unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}
13606
13607 #endif
13608
13609
13610 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13611 {
13612 std::lock_guard lock(client_lock);
13613 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13614 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
13615 tout(cct) << (unsigned long)fh << std::endl;
13616
13617 if (unmounting)
13618 return -ENOTCONN;
13619
13620 return _fallocate(fh, mode, offset, length);
13621 }
13622
13623 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13624 {
13625 std::lock_guard lock(client_lock);
13626 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
13627
13628 if (unmounting)
13629 return -ENOTCONN;
13630
13631 Fh *fh = get_filehandle(fd);
13632 if (!fh)
13633 return -EBADF;
13634 #if defined(__linux__) && defined(O_PATH)
13635 if (fh->flags & O_PATH)
13636 return -EBADF;
13637 #endif
13638 return _fallocate(fh, mode, offset, length);
13639 }
13640
13641 int Client::ll_release(Fh *fh)
13642 {
13643 std::lock_guard lock(client_lock);
13644
13645 if (unmounting)
13646 return -ENOTCONN;
13647
13648 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
13649 dendl;
13650 tout(cct) << __func__ << " (fh)" << std::endl;
13651 tout(cct) << (unsigned long)fh << std::endl;
13652
13653 if (ll_unclosed_fh_set.count(fh))
13654 ll_unclosed_fh_set.erase(fh);
13655 return _release_fh(fh);
13656 }
13657
13658 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13659 {
13660 std::lock_guard lock(client_lock);
13661
13662 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13663 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13664
13665 if (unmounting)
13666 return -ENOTCONN;
13667
13668 return _getlk(fh, fl, owner);
13669 }
13670
13671 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13672 {
13673 std::lock_guard lock(client_lock);
13674
13675 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13676 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13677
13678 if (unmounting)
13679 return -ENOTCONN;
13680
13681 return _setlk(fh, fl, owner, sleep);
13682 }
13683
13684 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13685 {
13686 std::lock_guard lock(client_lock);
13687
13688 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13689 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13690
13691 if (unmounting)
13692 return -ENOTCONN;
13693
13694 return _flock(fh, cmd, owner);
13695 }
13696
13697 int Client::set_deleg_timeout(uint32_t timeout)
13698 {
13699 std::lock_guard lock(client_lock);
13700
13701 /*
13702 * The whole point is to prevent blacklisting so we must time out the
13703 * delegation before the session autoclose timeout kicks in.
13704 */
13705 if (timeout >= mdsmap->get_session_autoclose())
13706 return -EINVAL;
13707
13708 deleg_timeout = timeout;
13709 return 0;
13710 }
13711
13712 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13713 {
13714 int ret = -EINVAL;
13715
13716 std::lock_guard lock(client_lock);
13717
13718 if (!mounted)
13719 return -ENOTCONN;
13720
13721 Inode *inode = fh->inode.get();
13722
13723 switch(cmd) {
13724 case CEPH_DELEGATION_NONE:
13725 inode->unset_deleg(fh);
13726 ret = 0;
13727 break;
13728 default:
13729 try {
13730 ret = inode->set_deleg(fh, cmd, cb, priv);
13731 } catch (std::bad_alloc&) {
13732 ret = -ENOMEM;
13733 }
13734 break;
13735 }
13736 return ret;
13737 }
13738
/**
 * Finisher context that interrupts an in-flight SETFILELOCK request.
 * Holds a ref on the request (taken in the ctor, dropped in finish) so
 * the request outlives the queued interrupt.
 */
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // pin the request until finish() runs
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    // only file-lock requests are interruptible this way
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
13754
13755 void Client::ll_interrupt(void *d)
13756 {
13757 MetaRequest *req = static_cast<MetaRequest*>(d);
13758 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
13759 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
13760 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13761 }
13762
13763 // =========================================
13764 // layout
13765
13766 // expose file layouts
13767
13768 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13769 const UserPerm& perms)
13770 {
13771 std::lock_guard lock(client_lock);
13772
13773 if (unmounting)
13774 return -ENOTCONN;
13775
13776 filepath path(relpath);
13777 InodeRef in;
13778 int r = path_walk(path, &in, perms);
13779 if (r < 0)
13780 return r;
13781
13782 *lp = in->layout;
13783
13784 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
13785 return 0;
13786 }
13787
13788 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13789 {
13790 std::lock_guard lock(client_lock);
13791
13792 if (unmounting)
13793 return -ENOTCONN;
13794
13795 Fh *f = get_filehandle(fd);
13796 if (!f)
13797 return -EBADF;
13798 Inode *in = f->inode.get();
13799
13800 *lp = in->layout;
13801
13802 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
13803 return 0;
13804 }
13805
13806 int64_t Client::get_default_pool_id()
13807 {
13808 std::lock_guard lock(client_lock);
13809
13810 if (unmounting)
13811 return -ENOTCONN;
13812
13813 /* first data pool is the default */
13814 return mdsmap->get_first_data_pool();
13815 }
13816
13817 // expose osdmap
13818
13819 int64_t Client::get_pool_id(const char *pool_name)
13820 {
13821 std::lock_guard lock(client_lock);
13822
13823 if (unmounting)
13824 return -ENOTCONN;
13825
13826 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13827 pool_name);
13828 }
13829
13830 string Client::get_pool_name(int64_t pool)
13831 {
13832 std::lock_guard lock(client_lock);
13833
13834 if (unmounting)
13835 return string();
13836
13837 return objecter->with_osdmap([pool](const OSDMap& o) {
13838 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13839 });
13840 }
13841
13842 int Client::get_pool_replication(int64_t pool)
13843 {
13844 std::lock_guard lock(client_lock);
13845
13846 if (unmounting)
13847 return -ENOTCONN;
13848
13849 return objecter->with_osdmap([pool](const OSDMap& o) {
13850 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13851 });
13852 }
13853
/**
 * Find the acting OSDs for the file extent containing byte `off` of fd.
 *
 * @param len out (optional): remaining bytes to the end of the stripe
 *            unit containing `off`
 * @param osds out: acting set for the extent's PG
 * @return 0 on success, -EBADF for a bad fd, -EINVAL if the acting set
 *         is empty
 */
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map a 1-byte range at `off` to exactly one object extent
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
    pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
    o.pg_to_acting_osds(pg, osds);
  });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13899
13900 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13901 {
13902 std::lock_guard lock(client_lock);
13903
13904 if (unmounting)
13905 return -ENOTCONN;
13906
13907 if (id < 0)
13908 return -EINVAL;
13909 return objecter->with_osdmap([&](const OSDMap& o) {
13910 return o.crush->get_full_location_ordered(id, path);
13911 });
13912 }
13913
13914 int Client::get_file_stripe_address(int fd, loff_t offset,
13915 vector<entity_addr_t>& address)
13916 {
13917 std::lock_guard lock(client_lock);
13918
13919 if (unmounting)
13920 return -ENOTCONN;
13921
13922 Fh *f = get_filehandle(fd);
13923 if (!f)
13924 return -EBADF;
13925 Inode *in = f->inode.get();
13926
13927 // which object?
13928 vector<ObjectExtent> extents;
13929 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13930 in->truncate_size, extents);
13931 ceph_assert(extents.size() == 1);
13932
13933 // now we have the object and its 'layout'
13934 return objecter->with_osdmap([&](const OSDMap& o) {
13935 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13936 vector<int> osds;
13937 o.pg_to_acting_osds(pg, osds);
13938 if (osds.empty())
13939 return -EINVAL;
13940 for (unsigned i = 0; i < osds.size(); i++) {
13941 entity_addr_t addr = o.get_addrs(osds[i]).front();
13942 address.push_back(addr);
13943 }
13944 return 0;
13945 });
13946 }
13947
13948 int Client::get_osd_addr(int osd, entity_addr_t& addr)
13949 {
13950 std::lock_guard lock(client_lock);
13951
13952 if (unmounting)
13953 return -ENOTCONN;
13954
13955 return objecter->with_osdmap([&](const OSDMap& o) {
13956 if (!o.exists(osd))
13957 return -ENOENT;
13958
13959 addr = o.get_addrs(osd).front();
13960 return 0;
13961 });
13962 }
13963
13964 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13965 loff_t length, loff_t offset)
13966 {
13967 std::lock_guard lock(client_lock);
13968
13969 if (unmounting)
13970 return -ENOTCONN;
13971
13972 Fh *f = get_filehandle(fd);
13973 if (!f)
13974 return -EBADF;
13975 Inode *in = f->inode.get();
13976
13977 // map to a list of extents
13978 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13979
13980 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13981 return 0;
13982 }
13983
13984
13985 /* find an osd with the same ip. -ENXIO if none. */
13986 int Client::get_local_osd()
13987 {
13988 std::lock_guard lock(client_lock);
13989
13990 if (unmounting)
13991 return -ENOTCONN;
13992
13993 objecter->with_osdmap([this](const OSDMap& o) {
13994 if (o.get_epoch() != local_osd_epoch) {
13995 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
13996 local_osd_epoch = o.get_epoch();
13997 }
13998 });
13999 return local_osd;
14000 }
14001
14002
14003
14004
14005
14006
14007 // ===============================
14008
14009 void Client::ms_handle_connect(Connection *con)
14010 {
14011 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
14012 }
14013
14014 bool Client::ms_handle_reset(Connection *con)
14015 {
14016 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14017 return false;
14018 }
14019
14020 void Client::ms_handle_remote_reset(Connection *con)
14021 {
14022 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14023 std::lock_guard l(client_lock);
14024 switch (con->get_peer_type()) {
14025 case CEPH_ENTITY_TYPE_MDS:
14026 {
14027 // kludge to figure out which mds this is; fixme with a Connection* state
14028 mds_rank_t mds = MDS_RANK_NONE;
14029 MetaSession *s = NULL;
14030 for (auto &p : mds_sessions) {
14031 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
14032 mds = p.first;
14033 s = &p.second;
14034 }
14035 }
14036 if (mds >= 0) {
14037 assert (s != NULL);
14038 switch (s->state) {
14039 case MetaSession::STATE_CLOSING:
14040 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
14041 _closed_mds_session(s);
14042 break;
14043
14044 case MetaSession::STATE_OPENING:
14045 {
14046 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
14047 list<Context*> waiters;
14048 waiters.swap(s->waiting_for_open);
14049 _closed_mds_session(s);
14050 MetaSession *news = _get_or_open_mds_session(mds);
14051 news->waiting_for_open.swap(waiters);
14052 }
14053 break;
14054
14055 case MetaSession::STATE_OPEN:
14056 {
14057 objecter->maybe_request_map(); /* to check if we are blacklisted */
14058 const auto& conf = cct->_conf;
14059 if (conf->client_reconnect_stale) {
14060 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
14061 _closed_mds_session(s);
14062 } else {
14063 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
14064 s->state = MetaSession::STATE_STALE;
14065 }
14066 }
14067 break;
14068
14069 case MetaSession::STATE_NEW:
14070 case MetaSession::STATE_CLOSED:
14071 default:
14072 break;
14073 }
14074 }
14075 }
14076 break;
14077 }
14078 }
14079
14080 bool Client::ms_handle_refused(Connection *con)
14081 {
14082 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
14083 return false;
14084 }
14085
14086 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14087 {
14088 Inode *quota_in = root_ancestor;
14089 SnapRealm *realm = in->snaprealm;
14090 while (realm) {
14091 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14092 if (realm->ino != in->ino) {
14093 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14094 if (p == inode_map.end())
14095 break;
14096
14097 if (p->second->quota.is_enable()) {
14098 quota_in = p->second;
14099 break;
14100 }
14101 }
14102 realm = realm->pparent;
14103 }
14104 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14105 return quota_in;
14106 }
14107
14108 /**
14109 * Traverse quota ancestors of the Inode, return true
14110 * if any of them passes the passed function
14111 */
14112 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14113 std::function<bool (const Inode &in)> test)
14114 {
14115 while (true) {
14116 ceph_assert(in != NULL);
14117 if (test(*in)) {
14118 return true;
14119 }
14120
14121 if (in == root_ancestor) {
14122 // We're done traversing, drop out
14123 return false;
14124 } else {
14125 // Continue up the tree
14126 in = get_quota_root(in, perms);
14127 }
14128 }
14129
14130 return false;
14131 }
14132
14133 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14134 {
14135 return check_quota_condition(in, perms,
14136 [](const Inode &in) {
14137 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14138 });
14139 }
14140
14141 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14142 const UserPerm& perms)
14143 {
14144 return check_quota_condition(in, perms,
14145 [&new_bytes](const Inode &in) {
14146 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14147 > in.quota.max_bytes;
14148 });
14149 }
14150
14151 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
14152 {
14153 ceph_assert(in->size >= in->reported_size);
14154 const uint64_t size = in->size - in->reported_size;
14155 return check_quota_condition(in, perms,
14156 [&size](const Inode &in) {
14157 if (in.quota.max_bytes) {
14158 if (in.rstat.rbytes >= in.quota.max_bytes) {
14159 return true;
14160 }
14161
14162 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14163 return (space >> 4) < size;
14164 } else {
14165 return false;
14166 }
14167 });
14168 }
14169
// Cached per-pool permission state for check_pool_perm(); stored in
// pool_perms keyed by (pool id, pool namespace).
enum {
  POOL_CHECKED = 1,   // a permission probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is in flight; other callers must wait
  POOL_READ = 4,      // read access to the pool was confirmed
  POOL_WRITE = 8,     // write access to the pool was confirmed
};
14176
/**
 * Check whether this client may read/write the data pool backing inode 'in'.
 *
 * Results are cached in pool_perms per (pool id, pool namespace). The first
 * caller probes the pool by issuing a stat and an exclusive create against
 * the inode's first object; concurrent callers wait on waiting_for_pool_perm
 * until the probe finishes.
 *
 * @param in   inode whose layout names the pool/namespace to check
 * @param need CEPH_CAP_FILE_RD and/or CEPH_CAP_FILE_WR
 * @return 0 if the needed access is allowed, -EPERM if denied,
 *         -EIO if the probe failed with an unexpected error
 */
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // consult the cache; if another thread is probing this pool, wait for it
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // mark the probe as in flight so concurrent callers block above
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // read probe: stat the first object (results discarded; only rc matters)
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // write probe: exclusive create (-EEXIST below still proves write access)
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // drop the client lock while both probes complete
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    // -ENOENT is fine: the object may not exist, but the read was permitted
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // cache the result and wake any waiters
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14279
14280 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14281 {
14282 if (acl_type == POSIX_ACL) {
14283 if (in->xattrs.count(ACL_EA_ACCESS)) {
14284 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14285
14286 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14287 }
14288 }
14289 return -EAGAIN;
14290 }
14291
/**
 * Update the POSIX access ACL of 'in' to reflect a chmod to 'mode'.
 *
 * Fetches the inode's xattrs if not yet cached, rewrites the access ACL via
 * posix_acl_access_chmod(), and stores the result with _do_setxattr().
 * A no-op (returning 0) when ACLs are disabled or no access ACL exists.
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // make sure the xattr map (and thus any cached ACL) is populated
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // work on a private copy of the ACL blob
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      // no access ACL present; nothing to rewrite
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
14317
/**
 * Compute the initial ACL xattrs for a new file created in 'dir'.
 *
 * If the parent directory has a default POSIX ACL, inherit it: derive the
 * new file's mode from the ACL (posix_acl_inherit_mode), optionally emit an
 * access ACL, and for directories propagate the default ACL. The resulting
 * xattr map is encoded into xattrs_bl. Without a default ACL, the umask
 * callback (if registered) is applied to *mode instead.
 *
 * @param dir       parent directory of the new inode
 * @param mode      in/out: requested mode, adjusted by ACL inheritance/umask
 * @param xattrs_bl out: encoded xattr map to attach to the create request
 * @return number of xattrs encoded (>= 0) on success, negative error code
 *         on failure
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // symlinks never carry ACLs
  if (S_ISLNK(*mode))
    return 0;

  // make sure the parent's xattr map (and thus any default ACL) is cached
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // work on a private copy of the parent's default ACL
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// the inherited ACL is not fully representable by mode bits alone;
	// keep it as the new file's access ACL
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // directories also inherit the default ACL itself
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // no default ACL: fall back to the registered umask callback
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14365
14366 void Client::set_filer_flags(int flags)
14367 {
14368 std::lock_guard l(client_lock);
14369 ceph_assert(flags == 0 ||
14370 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14371 objecter->add_global_op_flags(flags);
14372 }
14373
14374 void Client::clear_filer_flags(int flags)
14375 {
14376 std::lock_guard l(client_lock);
14377 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14378 objecter->clear_global_op_flag(flags);
14379 }
14380
14381 // called before mount
14382 void Client::set_uuid(const std::string& uuid)
14383 {
14384 std::lock_guard l(client_lock);
14385 assert(initialized);
14386 assert(!uuid.empty());
14387
14388 metadata["uuid"] = uuid;
14389 _close_sessions();
14390 }
14391
14392 // called before mount. 0 means infinite
14393 void Client::set_session_timeout(unsigned timeout)
14394 {
14395 std::lock_guard l(client_lock);
14396 assert(initialized);
14397
14398 metadata["timeout"] = stringify(timeout);
14399 }
14400
// called before mount
/**
 * Ask the MDSs to reclaim the session previously registered under 'uuid'
 * (see set_uuid()).
 *
 * @param uuid    uuid of the session to reclaim; must not be our own
 * @param flags   reclaim flags (e.g. CEPH_RECLAIM_RESET)
 * @param fs_name file system to subscribe to; may be empty for the default
 * @return 0 on success; otherwise:
 *   -ENOTCONN not initialized, -EINVAL bad uuid or client is umounting,
 *   -EPERM rejected by an MDS, -EOPNOTSUPP MDS lacks reclaim support,
 *   -ENOENT reset requested but no target session found,
 *   -ENOTRECOVERABLE reclaim failed or the target session was blacklisted
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    // reclaiming our own registered uuid makes no sense
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // wait for a usable mdsmap before contacting any MDS
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // send a reclaim request to every in MDS; 'mds' only advances once that
  // rank has finished reclaiming (RECLAIM_OK)
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      if (rejected_by_mds.count(mds))
	return -EPERM;
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      // (re)send the reclaim request and wait for the reply
      // (handle_client_reclaim_reply updates reclaim_state and signals us)
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      // RECLAIM_OK: move on to the next rank
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    // drop the client lock while waiting for the osdmap
    client_lock.unlock();
    cond.wait();
    client_lock.lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // remember the uuid so finish_reclaim() can adopt it as our own
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14499
14500 void Client::finish_reclaim()
14501 {
14502 auto it = metadata.find("reclaiming_uuid");
14503 if (it == metadata.end()) {
14504 for (auto &p : mds_sessions)
14505 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14506 return;
14507 }
14508
14509 for (auto &p : mds_sessions) {
14510 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14511 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
14512 p.second.con->send_message2(std::move(m));
14513 }
14514
14515 metadata["uuid"] = it->second;
14516 metadata.erase(it);
14517 }
14518
14519 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
14520 {
14521 mds_rank_t from = mds_rank_t(reply->get_source().num());
14522 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
14523
14524 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
14525 if (!session) {
14526 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
14527 return;
14528 }
14529
14530 if (reply->get_result() >= 0) {
14531 session->reclaim_state = MetaSession::RECLAIM_OK;
14532 if (reply->get_epoch() > reclaim_osd_epoch)
14533 reclaim_osd_epoch = reply->get_epoch();
14534 if (!reply->get_addrs().empty())
14535 reclaim_target_addrs = reply->get_addrs();
14536 } else {
14537 session->reclaim_state = MetaSession::RECLAIM_FAIL;
14538 reclaim_errno = reply->get_result();
14539 }
14540
14541 signal_cond_list(waiting_for_reclaim);
14542 }
14543
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 *
 * @param e the OSD map epoch to record as the barrier
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
14556
14557 const char** Client::get_tracked_conf_keys() const
14558 {
14559 static const char* keys[] = {
14560 "client_cache_size",
14561 "client_cache_mid",
14562 "client_acl_type",
14563 "client_deleg_timeout",
14564 "client_deleg_break_on_open",
14565 NULL
14566 };
14567 return keys;
14568 }
14569
14570 void Client::handle_conf_change(const ConfigProxy& conf,
14571 const std::set <std::string> &changed)
14572 {
14573 std::lock_guard lock(client_lock);
14574
14575 if (changed.count("client_cache_mid")) {
14576 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14577 }
14578 if (changed.count("client_acl_type")) {
14579 acl_type = NO_ACL;
14580 if (cct->_conf->client_acl_type == "posix_acl")
14581 acl_type = POSIX_ACL;
14582 }
14583 }
14584
// Take a reference on the inode on behalf of boost::intrusive_ptr<Inode>.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14589
// Drop a reference on behalf of boost::intrusive_ptr<Inode>; the owning
// client decides whether the inode can be freed.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14594
14595 mds_rank_t Client::_get_random_up_mds() const
14596 {
14597 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14598
14599 std::set<mds_rank_t> up;
14600 mdsmap->get_up_mds_set(up);
14601
14602 if (up.empty())
14603 return MDS_RANK_NONE;
14604 std::set<mds_rank_t>::const_iterator p = up.begin();
14605 for (int n = rand() % up.size(); n; n--)
14606 ++p;
14607 return *p;
14608 }
14609
14610
// A StandaloneClient creates and owns its own Objecter (unlike the base
// Client, which is handed one); it is deleted in ~StandaloneClient().
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14617
StandaloneClient::~StandaloneClient()
{
  // we allocated the objecter in our constructor, so we free it here
  delete objecter;
  objecter = nullptr;
}
14623
/**
 * Bring up the services a standalone client needs (timer, object cacher,
 * objecter, monitor session) and finish generic Client initialization.
 *
 * @return 0 on success, or the (negative) monclient init error; on failure
 *         everything started so far is torn down again.
 */
int StandaloneClient::init()
{
  // start our helper services before registering any dispatchers
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  // complete generic Client initialization without the lock held
  _finish_init();

  return 0;
}
14654
void StandaloneClient::shutdown()
{
  // shut down the generic client first, then the services started in init()
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}