// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <sys/file.h>
#include <sys/utsname.h>
#include <sys/uio.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#if defined(__FreeBSD__)
#define XATTR_CREATE 0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/Mutex.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Delegation.h"
#include "Dir.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_statx.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

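// C-style trampoline registered with ObjectCacher: recover the owning
// Client from the opaque pointer and forward the flush-set notification.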
void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}


// -------------

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

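// Admin socket entry point: dispatch the registered asok commands to the
// client's dump/kick helpers under client_lock, formatting the result
// into a single "result" object.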
bool Client::CommandHook::call(std::string_view command,
                               const cmdmap_t& cmdmap,
                               std::string_view format, bufferlist& out)
{
  std::unique_ptr<Formatter> f(Formatter::create(format));
  f->open_object_section("result");
  m_client->client_lock.Lock();
  if (command == "mds_requests")
    m_client->dump_mds_requests(f.get());
  else if (command == "mds_sessions")
    m_client->dump_mds_sessions(f.get());
  else if (command == "dump_cache")
    m_client->dump_cache(f.get());
  else if (command == "kick_stale_sessions")
    m_client->_kick_stale_sessions();
  else if (command == "status")
    m_client->dump_status(f.get());
  else
    ceph_abort_msg("bad command registered");
  m_client->client_lock.Unlock();
  f->close_section();
  f->flush(out);
  return true;
}


// -------------

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

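// Reset the faked-ino allocator: the range [1024, 2^32-1] becomes free
// again (the 1024~2048 window is later reserved for fake root inos, see
// _assign_faked_root). Faked inos are used when ino_t is too narrow
// (32-bit) or when client_use_faked_inos is set.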
void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
}

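// Hand out the next free faked ino (wrapping back to 2048 once the top of
// the 32-bit range is exhausted) and record the mapping so the real
// vinodeno_t can be recovered later via _map_faked_ino().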
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

/*
 * In faked mode, if you export multiple subdirectories, the root inodes
 * of the exported subdirectories may all report the same inode number.
 * So we distinguish the mount points by reserving the fake ino range
 * 1024~2048 and combining it with the last 10 bits (0x3ff) of each root
 * inode's real number.
 */
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff=" << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

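// Translate a faked ino back to the real (ino, snapid) pair. Ino 1 is
// always the root; unknown inos map to (0, CEPH_NOSNAP). map_faked_ino()
// is the locked public wrapper.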
vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::lock_guard lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

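// Wire the client into its messenger/monitor/objecter context, size the
// fd allocator and dentry LRU, and stand up the ObjectCacher-backed data
// path (writeback handler, object cacher, filer) before any mount.
//
// Rough lifecycle expected by callers (a sketch; the exact setup lives in
// the libcephfs mount glue, not in this file):
//   Client c(messenger, monclient, objecter);
//   c.init();      // registers dispatchers, perf counters, asok commands
//   ...            // mount, I/O, etc.
//   c.shutdown();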
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    client_lock("Client::client_lock"),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback, // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}


Client::~Client()
{
  ceph_assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}

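// Force-drop everything the client still holds: open file handles, open
// directories, the dentry LRU, and finally the root inode itself (only
// once nothing else remains pinned in inode_map).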
void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::lock_guard l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::lock_guard l(client_lock);
  root->ll_get();
  return root;
}


// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED ":"")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_num_ref()
                << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root, did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}

int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  ceph_assert(!initialized);

  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}

void Client::_finish_init()
{
  client_lock.Lock();
  // logger
  PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
  plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
  plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
  plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
  plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
  plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
  logger.reset(plb.create_perf_counters());
  cct->get_perfcounters_collection()->add(logger.get());

  client_lock.Unlock();

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           "mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions",
                                       "mds_sessions",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       "dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       "kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       "status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }

  client_lock.Lock();
  initialized = true;
  client_lock.Unlock();
}

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop(); // outside of client_lock! this does a join.

  client_lock.Lock();
  ceph_assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}


// ===================
// metadata cache stuff

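// Shrink the dentry LRU down to client_cache_size, optionally asking the
// kernel to drop its dcache entries too, and release the root inode once
// the cache is otherwise empty.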
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break; // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false); // drop dir, drop dentry
}


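// Apply an MDS-reported size/truncation to the cached inode. truncate_seq
// orders competing truncates: only a strictly newer seq (or a larger size
// at the same seq) is applied, and cached or inline data beyond the new
// size is invalidated.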
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}

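// Merge MDS-supplied ctime/mtime/atime into the inode. time_warp_seq
// decides who wins: a newer MDS seq takes its times wholesale, an equal
// seq keeps the max of each field, and when we hold exclusive caps our
// locally-updated times take precedence.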
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}

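// Create or refresh the cached Inode from an MDS InodeStat. Fields are
// only overwritten when the stat is strictly newer or covered by newly
// issued caps (never clobbering state we hold EXCL caps for), and the
// accompanying cap grant is installed afterwards.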
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true); // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  return in;
}


/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true); // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        old_diri->dir_ordered_count++;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

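// Record a dentry lease granted by the MDS: remember its expiry, issuing
// mds and seq so later lookups can trust the dentry without a round trip,
// and stamp the dentry with the parent directory's shared_gen.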
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty(); // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) { // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}

void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
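// The reply's extra_bl payload is: a DirStat for the fragment, a __u32
// dentry count and __u16 flags (END / HASH_ORDER), then per entry a
// (name, LeaseStat, InodeStat) triple; entries are linked into the Dir
// and mirrored into the dirp buffer and readdir cache.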
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true); // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}

/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true); // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true); // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst); // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true); // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true); // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

// -------

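// Pick the MDS rank to send a request to: an explicitly requested resend
// target first, then a dirfrag-hash match for the parent directory, then
// the auth cap holder (or any cap holder), falling back to a random
// active mds. Snapped inodes are walked up to a non-snap parent first.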
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed. */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}


void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
       q != info.export_targets.end();
       ++q) {
    if (mds_sessions.count(*q) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << *q << dendl;
      _open_mds_session(*q);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

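// After a successful mutation, confirm we actually have the target inode.
// If the reply carried no trace (e.g. because the MDS replayed the
// request), fall back to a lookup/getattr by name, and flag EINTR if the
// ino we created no longer matches what the lookup found.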
int Client::verify_reply_trace(int r,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume it's the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}


/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
           request->resend_mds < 0 && // forward
           !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}

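// Drop a request from the in-flight map and, if it was the oldest tid,
// advance oldest_tid past any remaining SETFILELOCK requests (those are
// excluded from the oldest-client-tid value advertised to the MDS).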
void Client::unregister_request(MetaRequest *req)
{
  mds_requests.erase(req->tid);
  if (req->tid == oldest_tid) {
    map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
    while (true) {
      if (p == mds_requests.end()) {
        oldest_tid = 0;
        break;
      }
      if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
        oldest_tid = p->first;
        break;
      }
      ++p;
    }
  }
  put_request(req);
}

void Client::put_request(MetaRequest *request)
{
  if (request->_put()) {
    int op = -1;
    if (request->success)
      op = request->get_op();
    InodeRef other_in;
    request->take_other_inode(&other_in);
    delete request;

    if (other_in &&
        (op == CEPH_MDS_OP_RMDIR ||
         op == CEPH_MDS_OP_RENAME ||
         op == CEPH_MDS_OP_RMSNAP)) {
      _try_to_trim_inode(other_in.get(), false);
    }
  }
}

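// Piggy-back a cap release for this inode onto an outgoing MDS request:
// drop the requested caps (except those still dirty or in use, or when
// the caps named in 'unless' are issued) and queue the release record.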
int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
                 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
        !(unless & cap.issued)) {
      ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(cap.issued) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
      ldout(cct, 25) << "Now have: " << ccap_string(cap.issued) << dendl;
    } else {
      released = force;
    }
    if (released) {
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel, ""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}

void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
                                   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
                 << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
                                    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
                 << dn << ")" << dendl;
}


/*
 * This requires the MClientRequest *request member to be set.
 * It will error out horribly without one.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  ldout(cct, 20) << __func__ << " enter (req: "
                 << req << ", mds: " << mds << ")" << dendl;
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
                         mds, req->inode_drop,
                         req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
                         mds, req->old_inode_drop,
                         req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
                         mds, req->other_inode_drop,
                         req->other_inode_unless);

  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
                          mds, req->dentry_drop,
                          req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
                          mds, req->old_dentry_drop,
                          req->old_dentry_unless);
ldout(cct, 25) << __func__ << " exit (req: "
<< req << ", mds: " << mds << ")" << dendl;
1947 }
1948
1949 bool Client::have_open_session(mds_rank_t mds)
1950 {
1951 const auto &it = mds_sessions.find(mds);
1952 return it != mds_sessions.end() &&
1953 (it->second.state == MetaSession::STATE_OPEN ||
1954 it->second.state == MetaSession::STATE_STALE);
1955 }
1956
1957 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1958 {
1959 const auto &it = mds_sessions.find(mds);
1960 if (it == mds_sessions.end() || it->second.con != con) {
1961 return NULL;
1962 } else {
1963 return &it->second;
1964 }
1965 }
1966
1967 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1968 {
1969 auto it = mds_sessions.find(mds);
1970 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
1971 }
1972
1973 /**
1974 * Populate a map of strings with client-identifying metadata,
1975 * such as the hostname. Call this once at initialization.
1976 */
1977 void Client::populate_metadata(const std::string &mount_root)
1978 {
1979 // Hostname
1980 struct utsname u;
1981 int r = uname(&u);
1982 if (r >= 0) {
1983 metadata["hostname"] = u.nodename;
1984 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1985 } else {
1986 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1987 }
1988
1989 metadata["pid"] = stringify(getpid());
1990
1991 // Ceph entity id (the '0' in "client.0")
1992 metadata["entity_id"] = cct->_conf->name.get_id();
1993
1994 // Our mount position
1995 if (!mount_root.empty()) {
1996 metadata["root"] = mount_root;
1997 }
1998
1999 // Ceph version
2000 metadata["ceph_version"] = pretty_version_to_str();
2001 metadata["ceph_sha1"] = git_version_to_str();
2002
2003 // Apply any metadata from the user's configured overrides
2004 std::vector<std::string> tokens;
2005 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2006 for (const auto &i : tokens) {
2007 auto eqpos = i.find("=");
2008 // Throw out anything that isn't of the form "<str>=<str>"
if (eqpos == 0 || eqpos == std::string::npos || eqpos + 1 == i.size()) {
2010 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2011 continue;
2012 }
2013 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2014 }
2015 }
2016
2017 /**
2018 * Optionally add or override client metadata fields.
2019 */
2020 void Client::update_metadata(std::string const &k, std::string const &v)
2021 {
2022 std::lock_guard l(client_lock);
2023 ceph_assert(initialized);
2024
2025 auto it = metadata.find(k);
2026 if (it != metadata.end()) {
2027 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2028 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2029 }
2030
2031 metadata[k] = v;
2032 }
2033
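/*
 * Create the MetaSession and send CEPH_SESSION_REQUEST_OPEN with our
 * metadata and supported feature bits.  If this MDS previously sent a
 * REJECT from the same address, skip the open attempt entirely.
 */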
2034 MetaSession *Client::_open_mds_session(mds_rank_t mds)
2035 {
2036 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2037 auto addrs = mdsmap->get_addrs(mds);
2038 auto em = mds_sessions.emplace(std::piecewise_construct,
2039 std::forward_as_tuple(mds),
2040 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2041 ceph_assert(em.second); /* not already present */
2042 MetaSession *session = &em.first->second;
2043
2044 // Maybe skip sending a request to open if this MDS daemon
2045 // has previously sent us a REJECT.
2046 if (rejected_by_mds.count(mds)) {
2047 if (rejected_by_mds[mds] == session->addrs) {
2048 ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
2049 "because we were rejected" << dendl;
2050 return session;
2051 } else {
2052 ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
2053 "rejected us, trying with new inst" << dendl;
2054 rejected_by_mds.erase(mds);
2055 }
2056 }
2057
2058 auto m = MClientSession::create(CEPH_SESSION_REQUEST_OPEN);
2059 m->metadata = metadata;
2060 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2061 session->con->send_message2(std::move(m));
2062 return session;
2063 }
2064
2065 void Client::_close_mds_session(MetaSession *s)
2066 {
2067 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2068 s->state = MetaSession::STATE_CLOSING;
2069 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2070 }
2071
2072 void Client::_closed_mds_session(MetaSession *s)
2073 {
2074 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2075 s->state = MetaSession::STATE_CLOSED;
2076 s->con->mark_down();
2077 signal_context_list(s->waiting_for_open);
2078 mount_cond.Signal();
2079 remove_session_caps(s);
2080 kick_requests_closed(s);
2081 mds_sessions.erase(s->mds_num);
2082 }
2083
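/*
 * Session state machine, driven by the MDS: OPEN/CLOSE move the session
 * state, RENEWCAPS extends cap_ttl, STALE invalidates leases and caps
 * and triggers a renew, RECALL_STATE asks us to trim caps, FLUSHMSG
 * must be acked, and REJECT records the MDS instance in rejected_by_mds
 * so we will not retry it at that address.
 */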
2084 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2085 {
2086 mds_rank_t from = mds_rank_t(m->get_source().num());
2087 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2088
2089 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2090 if (!session) {
2091 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2092 return;
2093 }
2094
2095 switch (m->get_op()) {
2096 case CEPH_SESSION_OPEN:
2097 {
2098 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2099 missing_features -= m->supported_features;
2100 if (!missing_features.empty()) {
2101 lderr(cct) << "mds." << from << " lacks required features '"
2102 << missing_features << "', closing session " << dendl;
2103 rejected_by_mds[session->mds_num] = session->addrs;
2104 _close_mds_session(session);
2105 _closed_mds_session(session);
2106 break;
2107 }
2108 session->mds_features = std::move(m->supported_features);
2109
2110 renew_caps(session);
2111 session->state = MetaSession::STATE_OPEN;
2112 if (unmounting)
2113 mount_cond.Signal();
2114 else
2115 connect_mds_targets(from);
2116 signal_context_list(session->waiting_for_open);
2117 break;
2118 }
2119
2120 case CEPH_SESSION_CLOSE:
2121 _closed_mds_session(session);
2122 break;
2123
2124 case CEPH_SESSION_RENEWCAPS:
2125 if (session->cap_renew_seq == m->get_seq()) {
2126 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2127 session->cap_ttl =
2128 session->last_cap_renew_request + mdsmap->get_session_timeout();
2129 if (was_stale)
2130 wake_up_session_caps(session, false);
2131 }
2132 break;
2133
2134 case CEPH_SESSION_STALE:
2135 // invalidate session caps/leases
2136 session->cap_gen++;
2137 session->cap_ttl = ceph_clock_now();
2138 session->cap_ttl -= 1;
2139 renew_caps(session);
2140 break;
2141
2142 case CEPH_SESSION_RECALL_STATE:
2143 trim_caps(session, m->get_max_caps());
2144 break;
2145
2146 case CEPH_SESSION_FLUSHMSG:
2147 /* flush cap release */
2148 if (auto& m = session->release; m) {
2149 session->con->send_message2(std::move(m));
2150 }
2151 session->con->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2152 break;
2153
2154 case CEPH_SESSION_FORCE_RO:
2155 force_session_readonly(session);
2156 break;
2157
2158 case CEPH_SESSION_REJECT:
2159 {
2160 std::string_view error_str;
2161 auto it = m->metadata.find("error_string");
2162 if (it != m->metadata.end())
2163 error_str = it->second;
2164 else
2165 error_str = "unknown error";
2166 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2167
2168 rejected_by_mds[session->mds_num] = session->addrs;
2169 _closed_mds_session(session);
2170 }
2171 break;
2172
2173 default:
2174 ceph_abort();
2175 }
2176 }
2177
2178 bool Client::_any_stale_sessions() const
2179 {
2180 ceph_assert(client_lock.is_locked_by_me());
2181
2182 for (const auto &p : mds_sessions) {
2183 if (p.second.state == MetaSession::STATE_STALE) {
2184 return true;
2185 }
2186 }
2187
2188 return false;
2189 }
2190
2191 void Client::_kick_stale_sessions()
2192 {
2193 ldout(cct, 1) << __func__ << dendl;
2194
2195 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2196 MetaSession &s = it->second;
2197 ++it;
2198 if (s.state == MetaSession::STATE_STALE)
2199 _closed_mds_session(&s);
2200 }
2201 }
2202
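/*
 * Build and transmit the MClientRequest for this attempt.  Replayed ops
 * (those that already got an unsafe reply) carry the target ino instead
 * of fresh cap releases; cap releases are also withheld while the
 * session has not yet completed its cap reconnect.
 */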
2203 void Client::send_request(MetaRequest *request, MetaSession *session,
2204 bool drop_cap_releases)
2205 {
2206 // make the request
2207 mds_rank_t mds = session->mds_num;
2208 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2209 << " for mds." << mds << dendl;
2210 auto r = build_client_request(request);
2211 if (request->dentry()) {
2212 r->set_dentry_wanted();
2213 }
2214 if (request->got_unsafe) {
2215 r->set_replayed_op();
2216 if (request->target)
2217 r->head.ino = request->target->ino;
2218 } else {
2219 encode_cap_releases(request, mds);
if (drop_cap_releases) // we haven't sent the cap reconnect yet; drop cap releases
2221 request->cap_releases.clear();
2222 else
2223 r->releases.swap(request->cap_releases);
2224 }
2225 r->set_mdsmap_epoch(mdsmap->get_epoch());
2226 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2227 objecter->with_osdmap([r](const OSDMap& o) {
2228 r->set_osdmap_epoch(o.get_epoch());
2229 });
2230 }
2231
2232 if (request->mds == -1) {
2233 request->sent_stamp = ceph_clock_now();
2234 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2235 }
2236 request->mds = mds;
2237
2238 Inode *in = request->inode();
2239 if (in) {
2240 auto it = in->caps.find(mds);
2241 if (it != in->caps.end()) {
2242 request->sent_on_mseq = it->second.mseq;
2243 }
2244 }
2245
2246 session->requests.push_back(&request->item);
2247
2248 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2249 session->con->send_message2(std::move(r));
2250 }
2251
2252 MClientRequest::ref Client::build_client_request(MetaRequest *request)
2253 {
2254 auto req = MClientRequest::create(request->get_op());
2255 req->set_tid(request->tid);
2256 req->set_stamp(request->op_stamp);
2257 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2258
// if the filepaths haven't been set, set them!
2260 if (request->path.empty()) {
2261 Inode *in = request->inode();
2262 Dentry *de = request->dentry();
2263 if (in)
2264 in->make_nosnap_relative_path(request->path);
2265 else if (de) {
2266 if (de->inode)
2267 de->inode->make_nosnap_relative_path(request->path);
2268 else if (de->dir) {
2269 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2270 request->path.push_dentry(de->name);
2271 }
2272 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2273 << " No path, inode, or appropriately-endowed dentry given!"
2274 << dendl;
2275 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2276 << " No path, inode, or dentry given!"
2277 << dendl;
2278 }
2279 req->set_filepath(request->get_filepath());
2280 req->set_filepath2(request->get_filepath2());
2281 req->set_data(request->data);
2282 req->set_retry_attempt(request->retry_attempt++);
2283 req->head.num_fwd = request->num_fwd;
2284 const gid_t *_gids;
2285 int gid_count = request->perms.get_gids(&_gids);
2286 req->set_gid_list(gid_count, _gids);
2287 return req;
2288 }
2289
2290
2291
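/*
 * The MDS does not forward the request itself; it tells us which rank
 * to try next.  Reset the retry counter, detach the request from its
 * current session, record the destination in resend_mds, and wake the
 * caller thread so make_request's loop re-sends it.
 */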
2292 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2293 {
2294 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2295 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2296 if (!session) {
2297 return;
2298 }
2299 ceph_tid_t tid = fwd->get_tid();
2300
2301 if (mds_requests.count(tid) == 0) {
2302 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2303 return;
2304 }
2305
2306 MetaRequest *request = mds_requests[tid];
2307 ceph_assert(request);
2308
2309 // reset retry counter
2310 request->retry_attempt = 0;
2311
2312 // request not forwarded, or dest mds has no session.
2313 // resend.
2314 ldout(cct, 10) << __func__ << " tid " << tid
2315 << " fwd " << fwd->get_num_fwd()
2316 << " to mds." << fwd->get_dest_mds()
2317 << ", resending to " << fwd->get_dest_mds()
2318 << dendl;
2319
2320 request->mds = -1;
2321 request->item.remove_myself();
2322 request->num_fwd = fwd->get_num_fwd();
2323 request->resend_mds = fwd->get_dest_mds();
2324 request->caller_cond->Signal();
2325 }
2326
2327 bool Client::is_dir_operation(MetaRequest *req)
2328 {
2329 int op = req->get_op();
2330 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2331 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2332 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2333 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2334 return true;
2335 return false;
2336 }
2337
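/*
 * A reply is "unsafe" when the MDS has applied the update but not yet
 * journaled it, and "safe" once it is durable.  A request may see an
 * unsafe reply, a safe reply, or both (unsafe first).  The caller is
 * woken on the first reply only; a safe reply that follows an unsafe
 * one just cleans up the tracking state.
 */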
2338 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2339 {
2340 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2341 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2342 if (!session) {
2343 return;
2344 }
2345
2346 ceph_tid_t tid = reply->get_tid();
2347 bool is_safe = reply->is_safe();
2348
2349 if (mds_requests.count(tid) == 0) {
2350 lderr(cct) << __func__ << " no pending request on tid " << tid
2351 << " safe is:" << is_safe << dendl;
2352 return;
2353 }
2354 MetaRequest *request = mds_requests.at(tid);
2355
2356 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2357 << " tid " << tid << dendl;
2358
2359 if (request->got_unsafe && !is_safe) {
2360 //duplicate response
2361 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2362 << mds_num << " safe:" << is_safe << dendl;
2363 return;
2364 }
2365
2366 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2367 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2368 << " from mds." << request->mds << dendl;
2369 request->send_to_auth = true;
2370 request->resend_mds = choose_target_mds(request);
2371 Inode *in = request->inode();
2372 std::map<mds_rank_t, Cap>::const_iterator it;
2373 if (request->resend_mds >= 0 &&
2374 request->resend_mds == request->mds &&
2375 (in == NULL ||
(it = in->caps.find(request->resend_mds)) == in->caps.end() ||
request->sent_on_mseq == it->second.mseq)) {
2378 ldout(cct, 20) << "have to return ESTALE" << dendl;
2379 } else {
2380 request->caller_cond->Signal();
2381 return;
2382 }
2383 }
2384
2385 ceph_assert(!request->reply);
2386 request->reply = reply;
2387 insert_trace(request, session);
2388
2389 // Handle unsafe reply
2390 if (!is_safe) {
2391 request->got_unsafe = true;
2392 session->unsafe_requests.push_back(&request->unsafe_item);
2393 if (is_dir_operation(request)) {
2394 Inode *dir = request->inode();
2395 ceph_assert(dir);
2396 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2397 }
2398 if (request->target) {
2399 InodeRef &in = request->target;
2400 in->unsafe_ops.push_back(&request->unsafe_target_item);
2401 }
2402 }
2403
2404 // Only signal the caller once (on the first reply):
// Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2406 if (!is_safe || !request->got_unsafe) {
2407 Cond cond;
2408 request->dispatch_cond = &cond;
2409
2410 // wake up waiter
2411 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2412 request->caller_cond->Signal();
2413
2414 // wake for kick back
2415 while (request->dispatch_cond) {
2416 ldout(cct, 20) << __func__ << " awaiting kickback on tid " << tid << " " << &cond << dendl;
2417 cond.Wait(client_lock);
2418 }
2419 }
2420
2421 if (is_safe) {
2422 // the filesystem change is committed to disk
2423 // we're done, clean up
2424 if (request->got_unsafe) {
2425 request->unsafe_item.remove_myself();
2426 request->unsafe_dir_item.remove_myself();
2427 request->unsafe_target_item.remove_myself();
2428 signal_cond_list(request->waitfor_safe);
2429 }
2430 request->item.remove_myself();
2431 unregister_request(request);
2432 }
2433 if (unmounting)
2434 mount_cond.Signal();
2435 }
2436
2437 void Client::_handle_full_flag(int64_t pool)
2438 {
2439 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2440 << "on " << pool << dendl;
2441 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2442 // to do this rather than blocking, because otherwise when we fill up we
2443 // potentially lock caps forever on files with dirty pages, and we need
2444 // to be able to release those caps to the MDS so that it can delete files
2445 // and free up space.
2446 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2447
2448 // For all inodes with layouts in this pool and a pending flush write op
2449 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2450 // from ObjectCacher so that it doesn't re-issue the write in response to
2451 // the ENOSPC error.
2452 // Fortunately since we're cancelling everything in a given pool, we don't
2453 // need to know which ops belong to which ObjectSet, we can just blow all
2454 // the un-flushed cached data away and mark any dirty inodes' async_err
2455 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2456 // affecting this pool, and all the objectsets we're purging were also
2457 // in this pool.
2458 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2459 i != inode_map.end(); ++i)
2460 {
2461 Inode *inode = i->second;
2462 if (inode->oset.dirty_or_tx
2463 && (pool == -1 || inode->layout.pool_id == pool)) {
2464 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2465 << " has dirty objects, purging and setting ENOSPC" << dendl;
2466 objectcacher->purge_set(&inode->oset);
2467 inode->set_async_err(-ENOSPC);
2468 }
2469 }
2470
2471 if (cancelled_epoch != (epoch_t)-1) {
2472 set_cap_epoch_barrier(cancelled_epoch);
2473 }
2474 }
2475
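/*
 * Watch incoming osdmaps for two client-affecting conditions: our own
 * addresses appearing in the blacklist (abort MDS sessions and cancel
 * in-flight writes with -EBLACKLISTED), and full conditions, either the
 * global full flag or per-pool FLAG_FULL (cancel writes with -ENOSPC).
 */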
2476 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2477 {
2478 std::set<entity_addr_t> new_blacklists;
2479 objecter->consume_blacklist_events(&new_blacklists);
2480
2481 const auto myaddrs = messenger->get_myaddrs();
2482 bool new_blacklist = false;
2483 bool prenautilus = objecter->with_osdmap(
2484 [&](const OSDMap& o) {
2485 return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
2486 });
2487 if (!blacklisted) {
2488 for (auto a : myaddrs.v) {
2489 // blacklist entries are always TYPE_ANY for nautilus+
2490 a.set_type(entity_addr_t::TYPE_ANY);
2491 if (new_blacklists.count(a)) {
2492 new_blacklist = true;
2493 break;
2494 }
2495 if (prenautilus) {
2496 // ...except pre-nautilus, they were TYPE_LEGACY
2497 a.set_type(entity_addr_t::TYPE_LEGACY);
2498 if (new_blacklists.count(a)) {
2499 new_blacklist = true;
2500 break;
2501 }
2502 }
2503 }
2504 }
2505 if (new_blacklist) {
2506 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2507 return o.get_epoch();
2508 });
2509 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2510 blacklisted = true;
2511
2512 _abort_mds_sessions(-EBLACKLISTED);
2513
// Since we know all our OSD ops will fail, cancel them all preemptively,
2515 // so that on an unhealthy cluster we can umount promptly even if e.g.
2516 // some PGs were inaccessible.
2517 objecter->op_cancel_writes(-EBLACKLISTED);
2518
2519 } else if (blacklisted) {
2520 // Handle case where we were blacklisted but no longer are
2521 blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2522 return o.is_blacklisted(myaddrs);});
2523 }
2524
// While blacklisted, keep subscribing to new osdmaps so we notice
// when our blacklist entry is removed.
2527 if (blacklisted) {
2528 objecter->maybe_request_map();
2529 }
2530
2531 if (objecter->osdmap_full_flag()) {
2532 _handle_full_flag(-1);
2533 } else {
2534 // Accumulate local list of full pools so that I can drop
2535 // the objecter lock before re-entering objecter in
2536 // cancel_writes
2537 std::vector<int64_t> full_pools;
2538
2539 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2540 for (const auto& kv : o.get_pools()) {
2541 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2542 full_pools.push_back(kv.first);
2543 }
2544 }
2545 });
2546
2547 for (auto p : full_pools)
2548 _handle_full_flag(p);
2549
2550 // Subscribe to subsequent maps to watch for the full flag going
2551 // away. For the global full flag objecter does this for us, but
2552 // it pays no attention to the per-pool full flag so in this branch
2553 // we do it ourselves.
2554 if (!full_pools.empty()) {
2555 objecter->maybe_request_map();
2556 }
2557 }
2558 }
2559
2560
2561 // ------------------------
2562 // incoming messages
2563
2564
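/*
 * Single entry point for all incoming messages.  Return true when the
 * message type is ours (consumed), false so the dispatcher can offer it
 * to another subsystem.  Each dispatch also gives a pending unmount a
 * chance to make progress by trimming the cache.
 */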
2565 bool Client::ms_dispatch2(const MessageRef &m)
2566 {
2567 std::lock_guard l(client_lock);
2568 if (!initialized) {
2569 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2570 return true;
2571 }
2572
2573 switch (m->get_type()) {
2574 // mounting and mds sessions
2575 case CEPH_MSG_MDS_MAP:
2576 handle_mds_map(MMDSMap::msgref_cast(m));
2577 break;
2578 case CEPH_MSG_FS_MAP:
2579 handle_fs_map(MFSMap::msgref_cast(m));
2580 break;
2581 case CEPH_MSG_FS_MAP_USER:
2582 handle_fs_map_user(MFSMapUser::msgref_cast(m));
2583 break;
2584 case CEPH_MSG_CLIENT_SESSION:
2585 handle_client_session(MClientSession::msgref_cast(m));
2586 break;
2587
2588 case CEPH_MSG_OSD_MAP:
2589 handle_osd_map(MOSDMap::msgref_cast(m));
2590 break;
2591
2592 // requests
2593 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2594 handle_client_request_forward(MClientRequestForward::msgref_cast(m));
2595 break;
2596 case CEPH_MSG_CLIENT_REPLY:
2597 handle_client_reply(MClientReply::msgref_cast(m));
2598 break;
2599
2600 // reclaim reply
2601 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2602 handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m));
2603 break;
2604
2605 case CEPH_MSG_CLIENT_SNAP:
2606 handle_snap(MClientSnap::msgref_cast(m));
2607 break;
2608 case CEPH_MSG_CLIENT_CAPS:
2609 handle_caps(MClientCaps::msgref_cast(m));
2610 break;
2611 case CEPH_MSG_CLIENT_LEASE:
2612 handle_lease(MClientLease::msgref_cast(m));
2613 break;
2614 case MSG_COMMAND_REPLY:
2615 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2616 handle_command_reply(MCommandReply::msgref_cast(m));
2617 } else {
2618 return false;
2619 }
2620 break;
2621 case CEPH_MSG_CLIENT_QUOTA:
2622 handle_quota(MClientQuota::msgref_cast(m));
2623 break;
2624
2625 default:
2626 return false;
2627 }
2628
2629 // unmounting?
2630 if (unmounting) {
2631 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2632 << "+" << inode_map.size() << dendl;
2633 long unsigned size = lru.lru_get_size() + inode_map.size();
2634 trim_cache();
2635 if (size < lru.lru_get_size() + inode_map.size()) {
2636 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2637 mount_cond.Signal();
2638 } else {
2639 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2640 << "+" << inode_map.size() << dendl;
2641 }
2642 }
2643
2644 return true;
2645 }
2646
2647 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2648 {
2649 fsmap.reset(new FSMap(m->get_fsmap()));
2650
2651 signal_cond_list(waiting_for_fsmap);
2652
2653 monclient->sub_got("fsmap", fsmap->get_epoch());
2654 }
2655
2656 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2657 {
2658 fsmap_user.reset(new FSMapUser);
2659 *fsmap_user = m->get_fsmap();
2660
2661 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2662 signal_cond_list(waiting_for_fsmap);
2663 }
2664
2665 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2666 {
2668 if (m->get_epoch() <= mdsmap->get_epoch()) {
2669 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2670 << " is identical to or older than our "
2671 << mdsmap->get_epoch() << dendl;
2672 return;
2673 }
2674
2675 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2676
2677 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2678 oldmap.swap(mdsmap);
2679
2680 mdsmap->decode(m->get_encoded());
2681
2682 // Cancel any commands for missing or laggy GIDs
2683 std::list<ceph_tid_t> cancel_ops;
2684 auto &commands = command_table.get_commands();
2685 for (const auto &i : commands) {
2686 auto &op = i.second;
2687 const mds_gid_t op_mds_gid = op.mds_gid;
2688 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2689 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2690 cancel_ops.push_back(i.first);
2691 if (op.outs) {
2692 std::ostringstream ss;
2693 ss << "MDS " << op_mds_gid << " went away";
2694 *(op.outs) = ss.str();
2695 }
2696 op.con->mark_down();
2697 if (op.on_finish) {
2698 op.on_finish->complete(-ETIMEDOUT);
2699 }
2700 }
2701 }
2702
2703 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2704 i != cancel_ops.end(); ++i) {
2705 command_table.erase(*i);
2706 }
2707
2708 // reset session
2709 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2710 mds_rank_t mds = p->first;
2711 MetaSession *session = &p->second;
2712 ++p;
2713
int oldstate = oldmap->get_state(mds);
int newstate = mdsmap->get_state(mds);
// detect an MDS restart (incarnation change) for every session, so the
// check further below always sees initialized, per-session values
mds_gid_t old_inc = oldmap->get_incarnation(mds);
mds_gid_t new_inc = mdsmap->get_incarnation(mds);
if (old_inc != new_inc) {
ldout(cct, 1) << "mds incarnation changed from "
<< old_inc << " to " << new_inc << dendl;
oldstate = MDSMap::STATE_NULL;
}
if (!mdsmap->is_up(mds)) {
session->con->mark_down();
} else if (mdsmap->get_addrs(mds) != session->addrs) {
session->con->mark_down();
session->addrs = mdsmap->get_addrs(mds);
2728 // When new MDS starts to take over, notify kernel to trim unused entries
2729 // in its dcache/icache. Hopefully, the kernel will release some unused
2730 // inodes before the new MDS enters reconnect state.
2731 trim_cache_for_reconnect(session);
2732 } else if (oldstate == newstate)
2733 continue; // no change
2734
2735 session->mds_state = newstate;
2736 if (old_inc != new_inc && newstate > MDSMap::STATE_RECONNECT) {
2737 // missed reconnect close the session so that it can be reopened
2738 _closed_mds_session(session);
2739 continue;
2740 }
2741 if (newstate == MDSMap::STATE_RECONNECT) {
2742 session->con = messenger->connect_to_mds(session->addrs);
2743 send_reconnect(session);
2744 } else if (newstate >= MDSMap::STATE_ACTIVE) {
2745 if (oldstate < MDSMap::STATE_ACTIVE) {
2746 // kick new requests
2747 kick_requests(session);
2748 kick_flushing_caps(session);
2749 signal_context_list(session->waiting_for_open);
2750 wake_up_session_caps(session, true);
2751 }
2752 connect_mds_targets(mds);
2753 } else if (newstate == MDSMap::STATE_NULL &&
2754 mds >= mdsmap->get_max_mds()) {
2755 _closed_mds_session(session);
2756 }
2757 }
2758
2759 // kick any waiting threads
2760 signal_cond_list(waiting_for_mdsmap);
2761
2762 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2763 }
2764
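/*
 * An MDS rank is restarting: replay our session state to it.  Send
 * every cap we hold (with its path and file locks) plus the snaprealms
 * they belong to, and re-send unsafe requests so the MDS can replay
 * them.  With the MULTI_RECONNECT feature the payload may be split
 * across several messages.
 */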
2765 void Client::send_reconnect(MetaSession *session)
2766 {
2767 mds_rank_t mds = session->mds_num;
2768 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
2769
2770 // trim unused caps to reduce MDS's cache rejoin time
2771 trim_cache_for_reconnect(session);
2772
2773 session->readonly = false;
2774
2775 session->release.reset();
2776
2777 // reset my cap seq number
2778 session->seq = 0;
2779 //connect to the mds' offload targets
2780 connect_mds_targets(mds);
2781 //make sure unsafe requests get saved
2782 resend_unsafe_requests(session);
2783
2784 early_kick_flushing_caps(session);
2785
2786 auto m = MClientReconnect::create();
2787 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
2788
2789 // i have an open session.
2790 ceph::unordered_set<inodeno_t> did_snaprealm;
2791 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2792 p != inode_map.end();
2793 ++p) {
2794 Inode *in = p->second;
2795 auto it = in->caps.find(mds);
2796 if (it != in->caps.end()) {
2797 if (allow_multi &&
2798 m->get_approx_size() >= (std::numeric_limits<int>::max() >> 1)) {
2799 m->mark_more();
2800 session->con->send_message2(std::move(m));
2801
2802 m = MClientReconnect::create();
2803 }
2804
2805 Cap &cap = it->second;
2806 ldout(cct, 10) << " caps on " << p->first
2807 << " " << ccap_string(cap.issued)
2808 << " wants " << ccap_string(in->caps_wanted())
2809 << dendl;
2810 filepath path;
2811 in->make_long_path(path);
2812 ldout(cct, 10) << " path " << path << dendl;
2813
2814 bufferlist flockbl;
2815 _encode_filelocks(in, flockbl);
2816
2817 cap.seq = 0; // reset seq.
2818 cap.issue_seq = 0; // reset seq.
2819 cap.mseq = 0; // reset seq.
2820 // cap gen should catch up with session cap_gen
2821 if (cap.gen < session->cap_gen) {
2822 cap.gen = session->cap_gen;
2823 cap.issued = cap.implemented = CEPH_CAP_PIN;
2824 } else {
2825 cap.issued = cap.implemented;
2826 }
2827 snapid_t snap_follows = 0;
2828 if (!in->cap_snaps.empty())
2829 snap_follows = in->cap_snaps.begin()->first;
2830
2831 m->add_cap(p->first.ino,
2832 cap.cap_id,
2833 path.get_ino(), path.get_path(), // ino
2834 in->caps_wanted(), // wanted
2835 cap.issued, // issued
2836 in->snaprealm->ino,
2837 snap_follows,
2838 flockbl);
2839
2840 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2841 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2842 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2843 did_snaprealm.insert(in->snaprealm->ino);
2844 }
2845 }
2846 }
2847
2848 if (!allow_multi)
2849 m->set_encoding_version(0); // use connection features to choose encoding
2850 session->con->send_message2(std::move(m));
2851
2852 mount_cond.Signal();
2853
2854 if (session->reclaim_state == MetaSession::RECLAIMING)
2855 signal_cond_list(waiting_for_reclaim);
2856 }
2857
2858
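/*
 * Re-drive requests after an MDS becomes active: wake aborted callers,
 * and send requests that have not been (re)transmitted yet
 * (retry_attempt == 0) to this session.  Requests that already got an
 * unsafe reply are handled by resend_unsafe_requests() instead.
 */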
2859 void Client::kick_requests(MetaSession *session)
2860 {
2861 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2862 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2863 p != mds_requests.end();
2864 ++p) {
2865 MetaRequest *req = p->second;
2866 if (req->got_unsafe)
2867 continue;
2868 if (req->aborted()) {
2869 if (req->caller_cond) {
2870 req->kick = true;
2871 req->caller_cond->Signal();
2872 }
2873 continue;
2874 }
2875 if (req->retry_attempt > 0)
2876 continue; // new requests only
2877 if (req->mds == session->mds_num) {
2878 send_request(p->second, session);
2879 }
2880 }
2881 }
2882
2883 void Client::resend_unsafe_requests(MetaSession *session)
2884 {
2885 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2886 !iter.end();
2887 ++iter)
2888 send_request(*iter, session);
2889
// also re-send old requests when the MDS enters the reconnect stage, so
// that the MDS can process completed requests in its clientreplay stage.
2892 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2893 p != mds_requests.end();
2894 ++p) {
2895 MetaRequest *req = p->second;
2896 if (req->got_unsafe)
2897 continue;
2898 if (req->aborted())
2899 continue;
2900 if (req->retry_attempt == 0)
2901 continue; // old requests only
2902 if (req->mds == session->mds_num)
2903 send_request(req, session, true);
2904 }
2905 }
2906
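/*
 * Wait for the tail of every session's unsafe-request list to become
 * safe.  This relies on safe replies arriving in per-session order:
 * once the last unsafe request is safe, all earlier ones are too.
 */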
2907 void Client::wait_unsafe_requests()
2908 {
2909 list<MetaRequest*> last_unsafe_reqs;
2910 for (const auto &p : mds_sessions) {
2911 const MetaSession &s = p.second;
2912 if (!s.unsafe_requests.empty()) {
2913 MetaRequest *req = s.unsafe_requests.back();
2914 req->get();
2915 last_unsafe_reqs.push_back(req);
2916 }
2917 }
2918
2919 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2920 p != last_unsafe_reqs.end();
2921 ++p) {
2922 MetaRequest *req = *p;
2923 if (req->unsafe_item.is_on_list())
2924 wait_on_list(req->waitfor_safe);
2925 put_request(req);
2926 }
2927 }
2928
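/*
 * The session closed under us.  Wake every caller with a request pinned
 * to this MDS so make_request can pick a new target, and drop unsafe
 * requests outright: with the session gone they can no longer be
 * replayed.
 */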
2929 void Client::kick_requests_closed(MetaSession *session)
2930 {
2931 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2932 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2933 p != mds_requests.end(); ) {
2934 MetaRequest *req = p->second;
2935 ++p;
2936 if (req->mds == session->mds_num) {
2937 if (req->caller_cond) {
2938 req->kick = true;
2939 req->caller_cond->Signal();
2940 }
2941 req->item.remove_myself();
2942 if (req->got_unsafe) {
2943 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
2944 req->unsafe_item.remove_myself();
2945 req->unsafe_dir_item.remove_myself();
2946 req->unsafe_target_item.remove_myself();
2947 signal_cond_list(req->waitfor_safe);
2948 unregister_request(req);
2949 }
2950 }
2951 }
2952 ceph_assert(session->requests.empty());
2953 ceph_assert(session->unsafe_requests.empty());
2954 }
2955
2956
2957
2958
2959 /************
2960 * leases
2961 */
2962
2963 void Client::got_mds_push(MetaSession *s)
2964 {
2965 s->seq++;
2966 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2967 if (s->state == MetaSession::STATE_CLOSING) {
2968 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2969 }
2970 }
2971
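/*
 * The MDS only sends lease revocations (the assert below).  Drop our
 * record of the dentry lease if we still hold it, and always answer
 * with CEPH_MDS_LEASE_RELEASE so the MDS can proceed.
 */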
2972 void Client::handle_lease(const MConstRef<MClientLease>& m)
2973 {
2974 ldout(cct, 10) << __func__ << " " << *m << dendl;
2975
2976 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
2977
2978 mds_rank_t mds = mds_rank_t(m->get_source().num());
2979 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
2980 if (!session) {
2981 return;
2982 }
2983
2984 got_mds_push(session);
2985
2986 ceph_seq_t seq = m->get_seq();
2987
2988 Inode *in;
2989 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
2990 if (inode_map.count(vino) == 0) {
2991 ldout(cct, 10) << " don't have vino " << vino << dendl;
2992 goto revoke;
2993 }
2994 in = inode_map[vino];
2995
2996 if (m->get_mask() & CEPH_LOCK_DN) {
2997 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
2998 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
2999 goto revoke;
3000 }
3001 Dentry *dn = in->dir->dentries[m->dname];
3002 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3003 dn->lease_mds = -1;
3004 }
3005
3006 revoke:
3007 {
3008 auto reply = MClientLease::create(CEPH_MDS_LEASE_RELEASE, seq, m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname);
3009 m->get_connection()->send_message2(std::move(reply));
3010 }
3011 }
3012
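/*
 * Drop n references on the inode.  The last put releases any remaining
 * caps and requires that the ObjectCacher holds no dirty data for the
 * inode (the release_set() assert), then unmaps and deletes it.
 */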
3013 void Client::put_inode(Inode *in, int n)
3014 {
3015 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3016 int left = in->_put(n);
3017 if (left == 0) {
3018 // release any caps
3019 remove_all_caps(in);
3020
3021 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3022 bool unclean = objectcacher->release_set(&in->oset);
3023 ceph_assert(!unclean);
3024 inode_map.erase(in->vino());
3025 if (use_faked_inos())
3026 _release_faked_ino(in);
3027
3028 if (in == root) {
3029 root = 0;
3030 root_ancestor = 0;
3031 while (!root_parents.empty())
3032 root_parents.erase(root_parents.begin());
3033 }
3034
3035 delete in;
3036 }
3037 }
3038
3039 void Client::close_dir(Dir *dir)
3040 {
3041 Inode *in = dir->parent_inode;
3042 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3043 ceph_assert(dir->is_empty());
3044 ceph_assert(in->dir == dir);
3045 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3046 if (!in->dentries.empty())
3047 in->get_first_parent()->put(); // unpin dentry
3048
3049 delete in->dir;
3050 in->dir = 0;
3051 put_inode(in); // unpin inode
3052 }
3053
3054 /**
* Don't call this with in==NULL; use get_or_create for that.
* Leave dn set to the default NULL unless you're trying to add
* a new inode to a pre-created Dentry.
3058 */
3059 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3060 {
3061 if (!dn) {
3062 // create a new Dentry
3063 dn = new Dentry(dir, name);
3064
3065 lru.lru_insert_mid(dn); // mid or top?
3066
3067 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3068 << " dn " << dn << " (new dn)" << dendl;
3069 } else {
3070 ceph_assert(!dn->inode);
3071 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3072 << " dn " << dn << " (old dn)" << dendl;
3073 }
3074
3075 if (in) { // link to inode
3076 InodeRef tmp_ref;
3077 // only one parent for directories!
3078 if (in->is_dir() && !in->dentries.empty()) {
3079 tmp_ref = in; // prevent unlink below from freeing the inode.
3080 Dentry *olddn = in->get_first_parent();
3081 ceph_assert(olddn->dir != dir || olddn->name != name);
3082 Inode *old_diri = olddn->dir->parent_inode;
3083 old_diri->dir_release_count++;
3084 clear_dir_complete_and_ordered(old_diri, true);
3085 unlink(olddn, true, true); // keep dir, dentry
3086 }
3087
3088 dn->link(in);
3089 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3090 }
3091
3092 return dn;
3093 }
3094
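/*
 * Detach a dentry from its inode and, unless keepdentry is set, from
 * its directory as well; a directory left empty is closed unless
 * keepdir.  keepdentry leaves a null dentry behind with its lease
 * invalidated.
 */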
3095 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3096 {
3097 InodeRef in(dn->inode);
3098 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3099 << " inode " << dn->inode << dendl;
3100
3101 // unlink from inode
3102 if (dn->inode) {
3103 dn->unlink();
3104 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
3105 }
3106
3107 if (keepdentry) {
3108 dn->lease_mds = -1;
3109 } else {
3110 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3111
3112 // unlink from dir
3113 Dir *dir = dn->dir;
3114 dn->detach();
3115
3116 // delete den
3117 lru.lru_remove(dn);
3118 dn->put();
3119
3120 if (dir->is_empty() && !keepdir)
3121 close_dir(dir);
3122 }
3123 }
3124
3125 /**
3126 * For asynchronous flushes, check for errors from the IO and
3127 * update the inode if necessary
3128 */
3129 class C_Client_FlushComplete : public Context {
3130 private:
3131 Client *client;
3132 InodeRef inode;
3133 public:
3134 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3135 void finish(int r) override {
3136 ceph_assert(client->client_lock.is_locked_by_me());
3137 if (r != 0) {
3138 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3139 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3140 << " 0x" << std::hex << inode->ino << std::dec
3141 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3142 inode->set_async_err(r);
3143 }
3144 }
3145 };
3146
3147
3148 /****
3149 * caps
3150 */
3151
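/*
 * Cap reference counting.  The first FILE_BUFFER or FILE_CACHE ref also
 * takes an inode ref, pinning the inode while dirty or cached data
 * exists; put_cap_ref drops those pins again and finishes any cap_snap
 * or writeback work that was waiting on the last user.
 */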
3152 void Client::get_cap_ref(Inode *in, int cap)
3153 {
3154 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3155 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3156 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3157 in->get();
3158 }
3159 if ((cap & CEPH_CAP_FILE_CACHE) &&
3160 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3161 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3162 in->get();
3163 }
3164 in->get_cap_ref(cap);
3165 }
3166
3167 void Client::put_cap_ref(Inode *in, int cap)
3168 {
3169 int last = in->put_cap_ref(cap);
3170 if (last) {
3171 int put_nref = 0;
3172 int drop = last & ~in->caps_issued();
3173 if (in->snapid == CEPH_NOSNAP) {
3174 if ((last & CEPH_CAP_FILE_WR) &&
3175 !in->cap_snaps.empty() &&
3176 in->cap_snaps.rbegin()->second.writing) {
3177 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3178 in->cap_snaps.rbegin()->second.writing = 0;
3179 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3180 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3181 }
3182 if (last & CEPH_CAP_FILE_BUFFER) {
3183 for (auto &p : in->cap_snaps)
3184 p.second.dirty_data = 0;
3185 signal_cond_list(in->waitfor_commit);
3186 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3187 ++put_nref;
3188 }
3189 }
3190 if (last & CEPH_CAP_FILE_CACHE) {
3191 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
3192 ++put_nref;
3193 }
3194 if (drop)
3195 check_caps(in, 0);
3196 if (put_nref)
3197 put_inode(in, put_nref);
3198 }
3199 }
3200
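/*
 * Block until the 'need' caps are issued and nothing in 'want' is being
 * revoked, then take a ref on 'need' and report need plus whatever of
 * 'want' we hold in *phave.  For writes, endoff may first require the
 * auth MDS to grow max_size, and pending cap_snaps must drain before
 * new buffered writes can proceed.
 */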
3201 int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
3202 {
3203 int r = check_pool_perm(in, need);
3204 if (r < 0)
3205 return r;
3206
3207 while (1) {
3208 int file_wanted = in->caps_file_wanted();
3209 if ((file_wanted & need) != need) {
3210 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3211 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3212 << dendl;
3213 return -EBADF;
3214 }
3215
3216 int implemented;
3217 int have = in->caps_issued(&implemented);
3218
3219 bool waitfor_caps = false;
3220 bool waitfor_commit = false;
3221
3222 if (have & need & CEPH_CAP_FILE_WR) {
3223 if (endoff > 0 &&
3224 (endoff >= (loff_t)in->max_size ||
3225 endoff > (loff_t)(in->size << 1)) &&
3226 endoff > (loff_t)in->wanted_max_size) {
3227 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3228 in->wanted_max_size = endoff;
3229 check_caps(in, 0);
3230 }
3231
3232 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3233 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3234 waitfor_caps = true;
3235 }
3236 if (!in->cap_snaps.empty()) {
3237 if (in->cap_snaps.rbegin()->second.writing) {
3238 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3239 waitfor_caps = true;
3240 }
3241 for (auto &p : in->cap_snaps) {
3242 if (p.second.dirty_data) {
3243 waitfor_commit = true;
3244 break;
3245 }
3246 }
3247 if (waitfor_commit) {
3248 _flush(in, new C_Client_FlushComplete(this, in));
3249 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3250 }
3251 }
3252 }
3253
3254 if (!waitfor_caps && !waitfor_commit) {
3255 if ((have & need) == need) {
3256 int revoking = implemented & ~have;
3257 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3258 << " need " << ccap_string(need) << " want " << ccap_string(want)
3259 << " revoking " << ccap_string(revoking)
3260 << dendl;
3261 if ((revoking & want) == 0) {
3262 *phave = need | (have & want);
3263 in->get_cap_ref(need);
3264 return 0;
3265 }
3266 }
3267 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3268 waitfor_caps = true;
3269 }
3270
3271 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3272 in->auth_cap->session->readonly)
3273 return -EROFS;
3274
3275 if (in->flags & I_CAP_DROPPED) {
3276 int mds_wanted = in->caps_mds_wanted();
3277 if ((mds_wanted & need) != need) {
3278 int ret = _renew_caps(in);
3279 if (ret < 0)
3280 return ret;
3281 continue;
3282 }
3283 if (!(file_wanted & ~mds_wanted))
3284 in->flags &= ~I_CAP_DROPPED;
3285 }
3286
3287 if (waitfor_caps)
3288 wait_on_list(in->waitfor_caps);
3289 else if (waitfor_commit)
3290 wait_on_list(in->waitfor_commit);
3291 }
3292 }
3293
3294 int Client::get_caps_used(Inode *in)
3295 {
3296 unsigned used = in->caps_used();
3297 if (!(used & CEPH_CAP_FILE_CACHE) &&
3298 !objectcacher->set_is_empty(&in->oset))
3299 used |= CEPH_CAP_FILE_CACHE;
3300 return used;
3301 }
3302
3303 void Client::cap_delay_requeue(Inode *in)
3304 {
3305 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3306 in->hold_caps_until = ceph_clock_now();
3307 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3308 delayed_list.push_back(&in->delay_cap_item);
3309 }
3310
3311 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3312 bool sync, int used, int want, int retain,
3313 int flush, ceph_tid_t flush_tid)
3314 {
3315 int held = cap->issued | cap->implemented;
3316 int revoking = cap->implemented & ~cap->issued;
3317 retain &= ~revoking;
3318 int dropping = cap->issued & ~retain;
3319 int op = CEPH_CAP_OP_UPDATE;
3320
3321 ldout(cct, 10) << __func__ << " " << *in
3322 << " mds." << session->mds_num << " seq " << cap->seq
3323 << (sync ? " sync " : " async ")
3324 << " used " << ccap_string(used)
3325 << " want " << ccap_string(want)
3326 << " flush " << ccap_string(flush)
3327 << " retain " << ccap_string(retain)
3328 << " held "<< ccap_string(held)
3329 << " revoking " << ccap_string(revoking)
3330 << " dropping " << ccap_string(dropping)
3331 << dendl;
3332
3333 if (cct->_conf->client_inject_release_failure && revoking) {
3334 const int would_have_issued = cap->issued & retain;
3335 const int would_have_implemented = cap->implemented & (cap->issued | used);
3336 // Simulated bug:
3337 // - tell the server we think issued is whatever they issued plus whatever we implemented
3338 // - leave what we have implemented in place
3339 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3340 cap->issued = cap->issued | cap->implemented;
3341
3342 // Make an exception for revoking xattr caps: we are injecting
3343 // failure to release other caps, but allow xattr because client
3344 // will block on xattr ops if it can't release these to MDS (#9800)
3345 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3346 cap->issued ^= xattr_mask & revoking;
3347 cap->implemented ^= xattr_mask & revoking;
3348
3349 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3350 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3351 } else {
3352 // Normal behaviour
3353 cap->issued &= retain;
3354 cap->implemented &= cap->issued | used;
3355 }
3356
3357 snapid_t follows = 0;
3358
3359 if (flush)
3360 follows = in->snaprealm->get_snap_context().seq;
3361
3362 auto m = MClientCaps::create(op,
3363 in->ino,
3364 0,
3365 cap->cap_id, cap->seq,
3366 cap->implemented,
3367 want,
3368 flush,
3369 cap->mseq,
3370 cap_epoch_barrier);
3371 m->caller_uid = in->cap_dirtier_uid;
3372 m->caller_gid = in->cap_dirtier_gid;
3373
3374 m->head.issue_seq = cap->issue_seq;
3375 m->set_tid(flush_tid);
3376
3377 m->head.uid = in->uid;
3378 m->head.gid = in->gid;
3379 m->head.mode = in->mode;
3380
3381 m->head.nlink = in->nlink;
3382
3383 if (flush & CEPH_CAP_XATTR_EXCL) {
3384 encode(in->xattrs, m->xattrbl);
3385 m->head.xattr_version = in->xattr_version;
3386 }
3387
3388 m->size = in->size;
3389 m->max_size = in->max_size;
3390 m->truncate_seq = in->truncate_seq;
3391 m->truncate_size = in->truncate_size;
3392 m->mtime = in->mtime;
3393 m->atime = in->atime;
3394 m->ctime = in->ctime;
3395 m->btime = in->btime;
3396 m->time_warp_seq = in->time_warp_seq;
3397 m->change_attr = in->change_attr;
3398 if (sync)
3399 m->flags |= MClientCaps::FLAG_SYNC;
3400 if (!in->cap_snaps.empty())
3401 m->flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3402
3403 if (flush & CEPH_CAP_FILE_WR) {
3404 m->inline_version = in->inline_version;
3405 m->inline_data = in->inline_data;
3406 }
3407
3408 in->reported_size = in->size;
3409 m->set_snap_follows(follows);
3410 cap->wanted = want;
3411 if (cap == in->auth_cap) {
3412 m->set_max_size(in->wanted_max_size);
3413 in->requested_max_size = in->wanted_max_size;
3414 ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
3415 }
3416
3417 if (!session->flushing_caps_tids.empty())
3418 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3419
3420 session->con->send_message2(std::move(m));
3421 }
3422
3423 static bool is_max_size_approaching(Inode *in)
3424 {
3425 /* mds will adjust max size according to the reported size */
3426 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3427 return false;
3428 if (in->size >= in->max_size)
3429 return true;
3430 /* half of previous max_size increment has been used */
3431 if (in->max_size > in->reported_size &&
3432 (in->size << 1) >= in->max_size + in->reported_size)
3433 return true;
3434 return false;
3435 }
3436
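/*
 * With LAZYIO, cache and buffer usage is accounted against the Fl cap
 * rather than Fc/Fb, so revoking Fc/Fb does not stall lazy readers and
 * writers.  Remap the used bits accordingly when Fl is implemented.
 */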
3437 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3438 {
3439 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3440 return used;
3441 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3442 return used;
3443
3444 if (issued & CEPH_CAP_FILE_LAZYIO) {
3445 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3446 used &= ~CEPH_CAP_FILE_CACHE;
3447 used |= CEPH_CAP_FILE_LAZYIO;
3448 }
3449 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3450 used &= ~CEPH_CAP_FILE_BUFFER;
3451 used |= CEPH_CAP_FILE_LAZYIO;
3452 }
3453 } else {
3454 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3455 used &= ~CEPH_CAP_FILE_CACHE;
3456 used |= CEPH_CAP_FILE_LAZYIO;
3457 }
3458 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3459 used &= ~CEPH_CAP_FILE_BUFFER;
3460 used |= CEPH_CAP_FILE_LAZYIO;
3461 }
3462 }
3463 return used;
3464 }
3465
3466 /**
3467 * check_caps
3468 *
3469 * Examine currently used and wanted versus held caps. Release, flush or ack
3470 * revoked caps to the MDS as appropriate.
3471 *
3472 * @param in the inode to check
3473 * @param flags flags to apply to cap check
3474 */
3475 void Client::check_caps(Inode *in, unsigned flags)
3476 {
3477 unsigned wanted = in->caps_wanted();
3478 unsigned used = get_caps_used(in);
3479 unsigned cap_used;
3480
3481 int implemented;
3482 int issued = in->caps_issued(&implemented);
3483 int revoking = implemented & ~issued;
3484
3485 int orig_used = used;
3486 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3487
3488 int retain = wanted | used | CEPH_CAP_PIN;
3489 if (!unmounting && in->nlink > 0) {
3490 if (wanted) {
3491 retain |= CEPH_CAP_ANY;
3492 } else if (in->is_dir() &&
3493 (issued & CEPH_CAP_FILE_SHARED) &&
3494 (in->flags & I_COMPLETE)) {
3495 // we do this here because we don't want to drop to Fs (and then
3496 // drop the Fs if we do a create!) if that alone makes us send lookups
3497 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3498 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3499 retain |= wanted;
3500 } else {
3501 retain |= CEPH_CAP_ANY_SHARED;
3502 // keep RD only if we didn't have the file open RW,
3503 // because then the mds would revoke it anyway to
3504 // journal max_size=0.
3505 if (in->max_size == 0)
3506 retain |= CEPH_CAP_ANY_RD;
3507 }
3508 }
3509
3510 ldout(cct, 10) << __func__ << " on " << *in
3511 << " wanted " << ccap_string(wanted)
3512 << " used " << ccap_string(used)
3513 << " issued " << ccap_string(issued)
3514 << " revoking " << ccap_string(revoking)
3515 << " flags=" << flags
3516 << dendl;
3517
3518 if (in->snapid != CEPH_NOSNAP)
3519 return; //snap caps last forever, can't write
3520
if (in->caps.empty())
return; // nothing to flush or release without any caps
3523
3524 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3525 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3526 if (_release(in))
3527 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3528 }
3529
3530 if (!in->cap_snaps.empty())
3531 flush_snaps(in);
3532
3533 for (auto &p : in->caps) {
3534 mds_rank_t mds = p.first;
3535 Cap &cap = p.second;
3536
3537 MetaSession *session = &mds_sessions.at(mds);
3538
3539 cap_used = used;
3540 if (in->auth_cap && &cap != in->auth_cap)
3541 cap_used &= ~in->auth_cap->issued;
3542
3543 revoking = cap.implemented & ~cap.issued;
3544
3545 ldout(cct, 10) << " cap mds." << mds
3546 << " issued " << ccap_string(cap.issued)
3547 << " implemented " << ccap_string(cap.implemented)
3548 << " revoking " << ccap_string(revoking) << dendl;
3549
3550 if (in->wanted_max_size > in->max_size &&
3551 in->wanted_max_size > in->requested_max_size &&
3552 &cap == in->auth_cap)
3553 goto ack;
3554
3555 /* approaching file_max? */
3556 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3557 &cap == in->auth_cap &&
3558 is_max_size_approaching(in)) {
3559 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3560 << ", reported " << in->reported_size << dendl;
3561 goto ack;
3562 }
3563
3564 /* completed revocation? */
3565 if (revoking && (revoking & cap_used) == 0) {
3566 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3567 goto ack;
3568 }
3569
3570 /* want more caps from mds? */
3571 if (wanted & ~(cap.wanted | cap.issued))
3572 goto ack;
3573
3574 if (!revoking && unmounting && (cap_used == 0))
3575 goto ack;
3576
3577 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
3578 !in->dirty_caps) // and we have no dirty caps
3579 continue;
3580
3581 if (!(flags & CHECK_CAPS_NODELAY)) {
3582 ldout(cct, 10) << "delaying cap release" << dendl;
3583 cap_delay_requeue(in);
3584 continue;
3585 }
3586
3587 ack:
3588 // re-send old cap/snapcap flushes first.
3589 if (session->mds_state >= MDSMap::STATE_RECONNECT &&
3590 session->mds_state < MDSMap::STATE_ACTIVE &&
3591 session->early_flushing_caps.count(in) == 0) {
3592 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3593 << " to mds." << session->mds_num << dendl;
3594 session->early_flushing_caps.insert(in);
3595 if (in->cap_snaps.size())
3596 flush_snaps(in, true);
3597 if (in->flushing_caps)
3598 flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
3599 }
3600
3601 int flushing;
3602 ceph_tid_t flush_tid;
3603 if (in->auth_cap == &cap && in->dirty_caps) {
3604 flushing = mark_caps_flushing(in, &flush_tid);
3605 } else {
3606 flushing = 0;
3607 flush_tid = 0;
3608 }
3609
3610 send_cap(in, session, &cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
3611 retain, flushing, flush_tid);
3612 }
3613 }
3614
3615
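/*
 * A snapshot boundary was crossed while this inode had dirty state:
 * capture its metadata (and note any dirty buffered data) in a CapSnap
 * under the old snap context, so it can be flushed to the MDS
 * independently of later writes.  If a write is in flight, the CapSnap
 * is finished when the last WR ref is put.
 */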
3616 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3617 {
3618 int used = get_caps_used(in);
3619 int dirty = in->caps_dirty();
3620 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3621
3622 if (in->cap_snaps.size() &&
3623 in->cap_snaps.rbegin()->second.writing) {
3624 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3625 return;
3626 } else if (in->caps_dirty() ||
3627 (used & CEPH_CAP_FILE_WR) ||
3628 (dirty & CEPH_CAP_ANY_WR)) {
3629 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3630 ceph_assert(capsnapem.second); /* element inserted */
3631 CapSnap &capsnap = capsnapem.first->second;
3632 capsnap.context = old_snapc;
3633 capsnap.issued = in->caps_issued();
3634 capsnap.dirty = in->caps_dirty();
3635
3636 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3637
3638 capsnap.uid = in->uid;
3639 capsnap.gid = in->gid;
3640 capsnap.mode = in->mode;
3641 capsnap.btime = in->btime;
3642 capsnap.xattrs = in->xattrs;
3643 capsnap.xattr_version = in->xattr_version;
3644 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3645 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3646
3647 if (used & CEPH_CAP_FILE_WR) {
3648 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3649 capsnap.writing = 1;
3650 } else {
3651 finish_cap_snap(in, capsnap, used);
3652 }
3653 } else {
3654 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3655 }
3656 }
3657
3658 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3659 {
3660 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3661 capsnap.size = in->size;
3662 capsnap.mtime = in->mtime;
3663 capsnap.atime = in->atime;
3664 capsnap.ctime = in->ctime;
3665 capsnap.time_warp_seq = in->time_warp_seq;
3666 capsnap.change_attr = in->change_attr;
3667 capsnap.dirty |= in->caps_dirty();
3668
3669 /* Only reset it if it wasn't set before */
3670 if (capsnap.cap_dirtier_uid == -1) {
3671 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3672 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3673 }
3674
3675 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3676 capsnap.inline_data = in->inline_data;
3677 capsnap.inline_version = in->inline_version;
3678 }
3679
3680 if (used & CEPH_CAP_FILE_BUFFER) {
3681 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
3682 << " WRBUFFER, delaying" << dendl;
3683 } else {
3684 capsnap.dirty_data = 0;
3685 flush_snaps(in);
3686 }
3687 }
3688
3689 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3690 {
3691 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
3692 in->cap_snaps.at(seq).dirty_data = 0;
3693 flush_snaps(in);
3694 }
3695
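// Send a CEPH_CAP_OP_FLUSHSNAP message to the auth MDS for each cap_snap
// that is ready (no in-progress write, no dirty buffered data); the rest
// are picked up later via finish_cap_snap()/_flushed_cap_snap(). With
// all_again set (e.g. when re-kicking after an MDS session reconnect),
// already-flushed cap_snaps are re-sent as well.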
3696 void Client::flush_snaps(Inode *in, bool all_again)
3697 {
3698 ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
3699 ceph_assert(in->cap_snaps.size());
3700
3701 // pick auth mds
3702 ceph_assert(in->auth_cap);
3703 MetaSession *session = in->auth_cap->session;
3704 int mseq = in->auth_cap->mseq;
3705
3706 for (auto &p : in->cap_snaps) {
3707 CapSnap &capsnap = p.second;
3708 if (!all_again) {
3709 // only flush once per session
3710 if (capsnap.flush_tid > 0)
3711 continue;
3712 }
3713
3714 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3715 << " follows " << p.first
3716 << " size " << capsnap.size
3717 << " mtime " << capsnap.mtime
3718 << " dirty_data=" << capsnap.dirty_data
3719 << " writing=" << capsnap.writing
3720 << " on " << *in << dendl;
3721 if (capsnap.dirty_data || capsnap.writing)
3722 continue;
3723
3724 if (capsnap.flush_tid == 0) {
3725 capsnap.flush_tid = ++last_flush_tid;
3726 if (!in->flushing_cap_item.is_on_list())
3727 session->flushing_caps.push_back(&in->flushing_cap_item);
3728 session->flushing_caps_tids.insert(capsnap.flush_tid);
3729 }
3730
3731 auto m = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
3732 cap_epoch_barrier);
3733 m->caller_uid = capsnap.cap_dirtier_uid;
3734 m->caller_gid = capsnap.cap_dirtier_gid;
3735
3736 m->set_client_tid(capsnap.flush_tid);
3737 m->head.snap_follows = p.first;
3738
3739 m->head.caps = capsnap.issued;
3740 m->head.dirty = capsnap.dirty;
3741
3742 m->head.uid = capsnap.uid;
3743 m->head.gid = capsnap.gid;
3744 m->head.mode = capsnap.mode;
3745 m->btime = capsnap.btime;
3746
3747 m->size = capsnap.size;
3748
3749 m->head.xattr_version = capsnap.xattr_version;
3750 encode(capsnap.xattrs, m->xattrbl);
3751
3752 m->ctime = capsnap.ctime;
3754 m->mtime = capsnap.mtime;
3755 m->atime = capsnap.atime;
3756 m->time_warp_seq = capsnap.time_warp_seq;
3757 m->change_attr = capsnap.change_attr;
3758
3759 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3760 m->inline_version = in->inline_version;
3761 m->inline_data = in->inline_data;
3762 }
3763
3764 ceph_assert(!session->flushing_caps_tids.empty());
3765 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3766
3767 session->con->send_message2(std::move(m));
3768 }
3769 }
3770
3771
3772
3773 void Client::wait_on_list(list<Cond*>& ls)
3774 {
3775 Cond cond;
3776 ls.push_back(&cond);
3777 cond.Wait(client_lock);
3778 ls.remove(&cond);
3779 }
3780
3781 void Client::signal_cond_list(list<Cond*>& ls)
3782 {
3783 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3784 (*it)->Signal();
3785 }
3786
3787 void Client::wait_on_context_list(list<Context*>& ls)
3788 {
3789 Cond cond;
3790 bool done = false;
3791 int r;
3792 ls.push_back(new C_Cond(&cond, &done, &r));
3793 while (!done)
3794 cond.Wait(client_lock);
3795 }
3796
3797 void Client::signal_context_list(list<Context*>& ls)
3798 {
3799 while (!ls.empty()) {
3800 ls.front()->complete(0);
3801 ls.pop_front();
3802 }
3803 }
3804
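// Wake threads blocked on in->waitfor_caps for every inode with a cap on
// this session. On reconnect the max_size negotiation state is reset so it
// will be re-requested; otherwise, caps the stale session failed to
// re-issue (gen mismatch) are downgraded to CEPH_CAP_PIN and the inode is
// flagged I_CAP_DROPPED so the MDS is told again what we want.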
3805 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
3806 {
3807 for (const auto &cap : s->caps) {
3808 auto &in = cap->inode;
3809 if (reconnect) {
3810 in.requested_max_size = 0;
3811 in.wanted_max_size = 0;
3812 } else {
3813 if (cap->gen < s->cap_gen) {
3814 // mds did not re-issue stale cap.
3815 cap->issued = cap->implemented = CEPH_CAP_PIN;
3816 // make sure mds knows what we want.
3817 if (in.caps_file_wanted() & ~cap->wanted)
3818 in.flags |= I_CAP_DROPPED;
3819 }
3820 }
3821 signal_cond_list(in.waitfor_caps);
3822 }
3823 }
3824
3825
3826 // flush dirty data (from objectcache)
3827
3828 class C_Client_CacheInvalidate : public Context {
3829 private:
3830 Client *client;
3831 vinodeno_t ino;
3832 int64_t offset, length;
3833 public:
3834 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3835 client(c), offset(off), length(len) {
3836 if (client->use_faked_inos())
3837 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3838 else
3839 ino = in->vino();
3840 }
3841 void finish(int r) override {
3842 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3843 ceph_assert(!client->client_lock.is_locked_by_me());
3844 client->_async_invalidate(ino, offset, length);
3845 }
3846 };
3847
3848 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3849 {
3850 if (unmounting)
3851 return;
3852 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
3853 ino_invalidate_cb(callback_handle, ino, off, len);
3854 }
3855
3856 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3857
3858 if (ino_invalidate_cb)
3859 // we queue the invalidate, which calls the callback and decrements the ref
3860 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3861 }
3862
3863 void Client::_invalidate_inode_cache(Inode *in)
3864 {
3865 ldout(cct, 10) << __func__ << " " << *in << dendl;
3866
3867 // invalidate our userspace inode cache
3868 if (cct->_conf->client_oc) {
3869 objectcacher->release_set(&in->oset);
3870 if (!objectcacher->set_is_empty(&in->oset))
3871 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3872 }
3873
3874 _schedule_invalidate_callback(in, 0, 0);
3875 }
3876
3877 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3878 {
3879 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
3880
3881 // invalidate our userspace inode cache
3882 if (cct->_conf->client_oc) {
3883 vector<ObjectExtent> ls;
3884 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3885 objectcacher->discard_writeback(&in->oset, ls, nullptr);
3886 }
3887
3888 _schedule_invalidate_callback(in, off, len);
3889 }
3890
3891 bool Client::_release(Inode *in)
3892 {
3893 ldout(cct, 20) << "_release " << *in << dendl;
3894 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3895 _invalidate_inode_cache(in);
3896 return true;
3897 }
3898 return false;
3899 }
3900
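// Start writeback of dirty buffered data for this inode via the
// ObjectCacher. Returns true if the flush completed (or was short-circuited)
// synchronously: either there was nothing dirty, or the pool is full and the
// cached data was purged with onfinish completed as -ENOSPC.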
3901 bool Client::_flush(Inode *in, Context *onfinish)
3902 {
3903 ldout(cct, 10) << "_flush " << *in << dendl;
3904
3905 if (!in->oset.dirty_or_tx) {
3906 ldout(cct, 10) << " nothing to flush" << dendl;
3907 onfinish->complete(0);
3908 return true;
3909 }
3910
3911 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3912 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3913 objectcacher->purge_set(&in->oset);
3914 if (onfinish) {
3915 onfinish->complete(-ENOSPC);
3916 }
3917 return true;
3918 }
3919
3920 return objectcacher->flush_set(&in->oset, onfinish);
3921 }
3922
3923 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
3924 {
3925 ceph_assert(client_lock.is_locked());
3926 if (!in->oset.dirty_or_tx) {
3927 ldout(cct, 10) << " nothing to flush" << dendl;
3928 return;
3929 }
3930
3931 C_SaferCond onflush("Client::_flush_range flock");
3932 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
3933 offset, size, &onflush);
3934 if (!ret) {
3935 // wait for flush
3936 client_lock.Unlock();
3937 onflush.wait();
3938 client_lock.Lock();
3939 }
3940 }
3941
3942 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3943 {
3944 // std::lock_guard l(client_lock);
3945 ceph_assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3946 Inode *in = static_cast<Inode *>(oset->parent);
3947 ceph_assert(in);
3948 _flushed(in);
3949 }
3950
3951 void Client::_flushed(Inode *in)
3952 {
3953 ldout(cct, 10) << "_flushed " << *in << dendl;
3954
3955 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
3956 }
3957
3958
3959
3960 // checks common to add_update_cap, handle_cap_grant
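// cache_gen guards cached file pages and shared_gen guards cached dentry
// state: bumping them when FILE_CACHE/FILE_SHARED is issued after having
// been lost marks anything cached under the old grant as stale.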
3961 void Client::check_cap_issue(Inode *in, unsigned issued)
3962 {
3963 unsigned had = in->caps_issued();
3964
3965 if ((issued & CEPH_CAP_FILE_CACHE) &&
3966 !(had & CEPH_CAP_FILE_CACHE))
3967 in->cache_gen++;
3968
3969 if ((issued & CEPH_CAP_FILE_SHARED) &&
3970 !(had & CEPH_CAP_FILE_SHARED)) {
3971 in->shared_gen++;
3972
3973 if (in->is_dir())
3974 clear_dir_complete_and_ordered(in, true);
3975 }
3976 }
3977
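// Insert or update the cap held from mds_session for this inode. The first
// cap attaches the inode to its snap realm, and an auth cap naming a
// different realm moves it. If the existing cap went stale (session cap_gen
// advanced) it is first reduced to CEPH_CAP_PIN; the export/import race is
// handled below by merging the old issue into the new one.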
3978 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
3979 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
3980 inodeno_t realm, int flags, const UserPerm& cap_perms)
3981 {
3982 if (!in->is_any_caps()) {
3983 ceph_assert(in->snaprealm == 0);
3984 in->snaprealm = get_snap_realm(realm);
3985 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
3986 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
3987 } else {
3988 ceph_assert(in->snaprealm);
3989 if ((flags & CEPH_CAP_FLAG_AUTH) &&
3990 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
3991 in->snaprealm_item.remove_myself();
3992 auto oldrealm = in->snaprealm;
3993 in->snaprealm = get_snap_realm(realm);
3994 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
3995 put_snap_realm(oldrealm);
3996 }
3997 }
3998
3999 mds_rank_t mds = mds_session->mds_num;
4000 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4001 Cap &cap = capem.first->second;
4002 if (!capem.second) {
4003 if (cap.gen < mds_session->cap_gen)
4004 cap.issued = cap.implemented = CEPH_CAP_PIN;
4005
4006 /*
4007 * auth mds of the inode changed. we received the cap export
4008 * message, but still haven't received the cap import message.
4009 * handle_cap_export() updated the new auth MDS' cap.
4010 *
4011 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4012 * a message that was sent before the cap import message. So
4013 * don't remove caps.
4014 */
4015 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4016 ceph_assert(&cap == in->auth_cap);
4017 ceph_assert(cap.cap_id == cap_id);
4018 seq = cap.seq;
4019 mseq = cap.mseq;
4020 issued |= cap.issued;
4021 flags |= CEPH_CAP_FLAG_AUTH;
4022 }
4023 }
4024
4025 check_cap_issue(in, issued);
4026
4027 if (flags & CEPH_CAP_FLAG_AUTH) {
4028 if (in->auth_cap != &cap &&
4029 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4030 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
4031 ldout(cct, 10) << __func__ << " changing auth cap: "
4032 << "add myself to new auth MDS' flushing caps list" << dendl;
4033 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4034 }
4035 in->auth_cap = &cap;
4036 }
4037 }
4038
4039 unsigned old_caps = cap.issued;
4040 cap.cap_id = cap_id;
4041 cap.issued = issued;
4042 cap.implemented |= issued;
4043 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4044 cap.wanted = wanted;
4045 else
4046 cap.wanted |= wanted;
4047 cap.seq = seq;
4048 cap.issue_seq = seq;
4049 cap.mseq = mseq;
4050 cap.gen = mds_session->cap_gen;
4051 cap.latest_perms = cap_perms;
4052 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4053 << " from mds." << mds
4054 << " on " << *in
4055 << dendl;
4056
4057 if ((issued & ~old_caps) && in->auth_cap == &cap) {
4058 // is a non-auth MDS revoking the newly granted caps?
4059 for (auto &p : in->caps) {
4060 if (&p.second == &cap)
4061 continue;
4062 if (p.second.implemented & ~p.second.issued & issued) {
4063 check_caps(in, CHECK_CAPS_NODELAY);
4064 break;
4065 }
4066 }
4067 }
4068
4069 if (issued & ~old_caps)
4070 signal_cond_list(in->waitfor_caps);
4071 }
4072
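// Drop one cap. queue_release controls whether an explicit release is
// queued to the MDS (false e.g. on export, where the cap has moved to
// another MDS). Removing the last cap also detaches the inode from its
// snap realm.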
4073 void Client::remove_cap(Cap *cap, bool queue_release)
4074 {
4075 auto &in = cap->inode;
4076 MetaSession *session = cap->session;
4077 mds_rank_t mds = cap->session->mds_num;
4078
4079 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4080
4081 if (queue_release) {
4082 session->enqueue_cap_release(
4083 in.ino,
4084 cap->cap_id,
4085 cap->issue_seq,
4086 cap->mseq,
4087 cap_epoch_barrier);
4088 }
4089
4090 if (in.auth_cap == cap) {
4091 if (in.flushing_cap_item.is_on_list()) {
4092 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4093 in.flushing_cap_item.remove_myself();
4094 }
4095 in.auth_cap = NULL;
4096 }
4097 size_t n = in.caps.erase(mds);
4098 ceph_assert(n == 1);
4099 cap = nullptr;
4100
4101 if (!in.is_any_caps()) {
4102 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4103 in.snaprealm_item.remove_myself();
4104 put_snap_realm(in.snaprealm);
4105 in.snaprealm = 0;
4106 }
4107 }
4108
4109 void Client::remove_all_caps(Inode *in)
4110 {
4111 while (!in->caps.empty())
4112 remove_cap(&in->caps.begin()->second, true);
4113 }
4114
4115 void Client::remove_session_caps(MetaSession *s)
4116 {
4117 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4118
4119 while (s->caps.size()) {
4120 Cap *cap = *s->caps.begin();
4121 InodeRef in(&cap->inode);
4122 bool dirty_caps = false, cap_snaps = false;
4123 if (in->auth_cap == cap) {
4124 cap_snaps = !in->cap_snaps.empty();
4125 dirty_caps = in->dirty_caps | in->flushing_caps;
4126 in->wanted_max_size = 0;
4127 in->requested_max_size = 0;
4128 }
4129 if (cap->wanted | cap->issued)
4130 in->flags |= I_CAP_DROPPED;
4131 remove_cap(cap, false);
4132 if (cap_snaps) {
4133 in->cap_snaps.clear();
4134 }
4135 if (dirty_caps) {
4136 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4137 if (in->flushing_caps) {
4138 num_flushing_caps--;
4139 in->flushing_cap_tids.clear();
4140 }
4141 in->flushing_caps = 0;
4142 in->mark_caps_clean();
4143 put_inode(in.get());
4144 }
4145 signal_cond_list(in->waitfor_caps);
4146 }
4147 s->flushing_caps_tids.clear();
4148 sync_cond.Signal();
4149 }
4150
4151 int Client::_do_remount(bool retry_on_error)
4152 {
4153 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
4154
4155 errno = 0;
4156 int r = remount_cb(callback_handle);
4157 if (r == 0) {
4158 retries_on_invalidate = 0;
4159 } else {
4160 int e = errno;
4161 client_t whoami = get_nodeid();
4162 if (r == -1) {
4163 lderr(cct) <<
4164 "failed to remount (to trim kernel dentries): "
4165 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4166 } else {
4167 lderr(cct) <<
4168 "failed to remount (to trim kernel dentries): "
4169 "return code = " << r << dendl;
4170 }
4171 bool should_abort =
4172 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4173 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4174 !(retry_on_error && (++retries_on_invalidate < max_retries));
4175 if (should_abort && !unmounting) {
4176 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4177 ceph_abort();
4178 }
4179 }
4180 return r;
4181 }
4182
4183 class C_Client_Remount : public Context {
4184 private:
4185 Client *client;
4186 public:
4187 explicit C_Client_Remount(Client *c) : client(c) {}
4188 void finish(int r) override {
4189 ceph_assert(r == 0);
4190 client->_do_remount(true);
4191 }
4192 };
4193
4194 void Client::_invalidate_kernel_dcache()
4195 {
4196 if (unmounting)
4197 return;
4198 if (can_invalidate_dentries) {
4199 if (dentry_invalidate_cb && root->dir) {
4200 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4201 p != root->dir->dentries.end();
4202 ++p) {
4203 if (p->second->inode)
4204 _schedule_invalidate_dentry_callback(p->second, false);
4205 }
4206 }
4207 } else if (remount_cb) {
4208 // Hacky: when remounting a file system, the Linux kernel trims all
4209 // unused dentries in that fs
4210 remount_finisher.queue(new C_Client_Remount(this));
4211 }
4212 }
4213
4214 void Client::_trim_negative_child_dentries(InodeRef& in)
4215 {
4216 if (!in->is_dir())
4217 return;
4218
4219 Dir* dir = in->dir;
4220 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4221 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4222 Dentry *dn = p->second;
4223 ++p;
4224 ceph_assert(!dn->inode);
4225 if (dn->lru_is_expireable())
4226 unlink(dn, true, false); // keep dir, drop dentry
4227 }
4228 if (dir->dentries.empty()) {
4229 close_dir(dir);
4230 }
4231 }
4232
4233 if (in->flags & I_SNAPDIR_OPEN) {
4234 InodeRef snapdir = open_snapdir(in.get());
4235 _trim_negative_child_dentries(snapdir);
4236 }
4237 }
4238
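// Respond to an MDS cap recall: try to shrink this session's cap count to
// at most 'max'. Unused non-auth caps are dropped directly; for the rest we
// try to make the inode's dentries expireable so LRU trimming can drop the
// inode and its cap. If we still hold too many caps afterwards, ask the
// kernel to shrink its dcache so more dentries become expireable.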
4239 void Client::trim_caps(MetaSession *s, uint64_t max)
4240 {
4241 mds_rank_t mds = s->mds_num;
4242 size_t caps_size = s->caps.size();
4243 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
4244 << " caps " << caps_size << dendl;
4245
4246 uint64_t trimmed = 0;
4247 auto p = s->caps.begin();
4248 std::set<Dentry *> to_trim; /* this keeps caps other than the one we're
4249 * looking at from getting deleted during traversal. */
4250 while ((caps_size - trimmed) > max && !p.end()) {
4251 Cap *cap = *p;
4252 InodeRef in(&cap->inode);
4253
4254 // Increment p early because it will be invalidated if cap
4255 // is deleted inside remove_cap
4256 ++p;
4257
4258 if (in->caps.size() > 1 && cap != in->auth_cap) {
4259 int mine = cap->issued | cap->implemented;
4260 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4261 // disposable non-auth cap
4262 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
4263 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4264 cap = (remove_cap(cap, true), nullptr);
4265 trimmed++;
4266 }
4267 } else {
4268 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4269 _trim_negative_child_dentries(in);
4270 bool all = true;
4271 auto q = in->dentries.begin();
4272 while (q != in->dentries.end()) {
4273 Dentry *dn = *q;
4274 ++q;
4275 if (dn->lru_is_expireable()) {
4276 if (can_invalidate_dentries &&
4277 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4278 // Only issue one of these per DN for inodes in root: handle
4279 // others more efficiently by calling for root-child DNs at
4280 // the end of this function.
4281 _schedule_invalidate_dentry_callback(dn, true);
4282 }
4283 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4284 to_trim.insert(dn);
4285 } else {
4286 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4287 all = false;
4288 }
4289 }
4290 if (all && in->ino != MDS_INO_ROOT) {
4291 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4292 trimmed++;
4293 }
4294 }
4295 }
4296 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4297 for (const auto &dn : to_trim) {
4298 trim_dentry(dn);
4299 }
4300 to_trim.clear();
4301
4302 caps_size = s->caps.size();
4303 if (caps_size > (size_t)max)
4304 _invalidate_kernel_dcache();
4305 }
4306
4307 void Client::force_session_readonly(MetaSession *s)
4308 {
4309 s->readonly = true;
4310 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4311 auto &in = (*p)->inode;
4312 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4313 signal_cond_list(in.waitfor_caps);
4314 }
4315 }
4316
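// Transition the inode's dirty caps to "flushing" and allocate a flush tid.
// The tid is recorded per-inode (flushing_cap_tids) and per-session
// (flushing_caps_tids) so handle_cap_flush_ack() and wait_sync_caps() can
// match acks to outstanding flushes.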
4317 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4318 {
4319 MetaSession *session = in->auth_cap->session;
4320
4321 int flushing = in->dirty_caps;
4322 ceph_assert(flushing);
4323
4324 ceph_tid_t flush_tid = ++last_flush_tid;
4325 in->flushing_cap_tids[flush_tid] = flushing;
4326
4327 if (!in->flushing_caps) {
4328 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4329 num_flushing_caps++;
4330 } else {
4331 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4332 }
4333
4334 in->flushing_caps |= flushing;
4335 in->mark_caps_clean();
4336
4337 if (!in->flushing_cap_item.is_on_list())
4338 session->flushing_caps.push_back(&in->flushing_cap_item);
4339 session->flushing_caps_tids.insert(flush_tid);
4340
4341 *ptid = flush_tid;
4342 return flushing;
4343 }
4344
4345 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4346 {
4347 for (auto &p : in->cap_snaps) {
4348 CapSnap &capsnap = p.second;
4349 if (capsnap.flush_tid > 0) {
4350 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4351 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4352 }
4353 }
4354 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4355 it != in->flushing_cap_tids.end();
4356 ++it) {
4357 old_s->flushing_caps_tids.erase(it->first);
4358 new_s->flushing_caps_tids.insert(it->first);
4359 }
4360 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4361 }
4362
4363 /*
4364 * Flush all caps back to the MDS. Because the callers generally wait on the
4365 * result of this function (syncfs and umount cases), we set
4366 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4367 */
4368 void Client::flush_caps_sync()
4369 {
4370 ldout(cct, 10) << __func__ << dendl;
4371 xlist<Inode*>::iterator p = delayed_list.begin();
4372 while (!p.end()) {
4373 unsigned flags = CHECK_CAPS_NODELAY;
4374 Inode *in = *p;
4375
4376 ++p;
4377 delayed_list.pop_front();
4378 if (p.end() && dirty_list.empty())
4379 flags |= CHECK_CAPS_SYNCHRONOUS;
4380 check_caps(in, flags);
4381 }
4382
4383 // other caps, too
4384 p = dirty_list.begin();
4385 while (!p.end()) {
4386 unsigned flags = CHECK_CAPS_NODELAY;
4387 Inode *in = *p;
4388
4389 ++p;
4390 if (p.end())
4391 flags |= CHECK_CAPS_SYNCHRONOUS;
4392 check_caps(in, flags);
4393 }
4394 }
4395
4396 void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
4397 {
4398 ldout(cct, 10) << __func__ << " " << in << " mds." << session->mds_num << dendl;
4399 Cap *cap = in->auth_cap;
4400 ceph_assert(cap->session == session);
4401
4402 for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
4403 p != in->flushing_cap_tids.end();
4404 ++p) {
4405 bool req_sync = false;
4406
4407 /* If this is a synchronous request, then flush the journal on the last one */
4408 if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
4409 req_sync = true;
4410
4411 send_cap(in, session, cap, req_sync,
4412 (get_caps_used(in) | in->caps_dirty()),
4413 in->caps_wanted(), (cap->issued | cap->implemented),
4414 p->second, p->first);
4415 }
4416 }
4417
4418 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4419 {
4420 while (in->flushing_caps) {
4421 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4422 ceph_assert(it != in->flushing_cap_tids.end());
4423 if (it->first > want)
4424 break;
4425 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4426 << ccap_string(it->second) << " want " << want
4427 << " last " << it->first << dendl;
4428 wait_on_list(in->waitfor_caps);
4429 }
4430 }
4431
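// Wait until all sessions have acked every cap flush with tid <= want.
// handle_cap_flush_ack() signals sync_cond whenever a session's oldest
// outstanding flush tid advances past an acked tid.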
4432 void Client::wait_sync_caps(ceph_tid_t want)
4433 {
4434 retry:
4435 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4436 << num_flushing_caps << " total flushing)" << dendl;
4437 for (auto &p : mds_sessions) {
4438 MetaSession *s = &p.second;
4439 if (s->flushing_caps_tids.empty())
4440 continue;
4441 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4442 if (oldest_tid <= want) {
4443 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4444 << " (want " << want << ")" << dendl;
4445 sync_cond.Wait(client_lock);
4446 goto retry;
4447 }
4448 }
4449 }
4450
4451 void Client::kick_flushing_caps(MetaSession *session)
4452 {
4453 mds_rank_t mds = session->mds_num;
4454 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4455
4456 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4457 Inode *in = *p;
4458 if (session->early_flushing_caps.count(in))
4459 continue;
4460 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4461 if (in->cap_snaps.size())
4462 flush_snaps(in, true);
4463 if (in->flushing_caps)
4464 flush_caps(in, session);
4465 }
4466
4467 session->early_flushing_caps.clear();
4468 }
4469
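// Re-send cap flushes while the session is still reconnecting (before it
// goes ACTIVE), but only for inodes whose flushing caps were revoked in the
// meantime; fully-issued flushes can wait for kick_flushing_caps().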
4470 void Client::early_kick_flushing_caps(MetaSession *session)
4471 {
4472 session->early_flushing_caps.clear();
4473
4474 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4475 Inode *in = *p;
4476 Cap *cap = in->auth_cap;
4477 ceph_assert(cap);
4478
4479 // if flushing caps were revoked, we re-send the cap flush in the client
4480 // reconnect stage. This guarantees that the MDS processes the cap flush
4481 // message before issuing the flushing caps to another client.
4482 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
4483 continue;
4484
4485 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4486 << " to mds." << session->mds_num << dendl;
4487
4488 session->early_flushing_caps.insert(in);
4489
4490 // send_reconnect() will also reset these sequence numbers. Make sure the
4491 // sequence numbers in the cap flush message match the later reconnect message.
4492 cap->seq = 0;
4493 cap->issue_seq = 0;
4494 cap->mseq = 0;
4495 cap->issued = cap->implemented;
4496
4497 if (in->cap_snaps.size())
4498 flush_snaps(in, true);
4499 if (in->flushing_caps)
4500 flush_caps(in, session);
4501
4502 }
4503 }
4504
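// Rebuild this realm's cached SnapContext as the union of our own snaps,
// the parent's snaps from parent_since onward, and any prior-parent snaps,
// with seq taken as the newest seq seen. snapids are stored in descending
// order; e.g. (illustrative values) my_snaps {4} plus parent snaps {2, 6}
// yields snaps [6, 4, 2].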
4505 void SnapRealm::build_snap_context()
4506 {
4507 set<snapid_t> snaps;
4508 snapid_t max_seq = seq;
4509
4510 // start with prior_parents?
4511 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4512 snaps.insert(prior_parent_snaps[i]);
4513
4514 // current parent's snaps
4515 if (pparent) {
4516 const SnapContext& psnapc = pparent->get_snap_context();
4517 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4518 if (psnapc.snaps[i] >= parent_since)
4519 snaps.insert(psnapc.snaps[i]);
4520 if (psnapc.seq > max_seq)
4521 max_seq = psnapc.seq;
4522 }
4523
4524 // my snaps
4525 for (unsigned i=0; i<my_snaps.size(); i++)
4526 snaps.insert(my_snaps[i]);
4527
4528 // ok!
4529 cached_snap_context.seq = max_seq;
4530 cached_snap_context.snaps.resize(0);
4531 cached_snap_context.snaps.reserve(snaps.size());
4532 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4533 cached_snap_context.snaps.push_back(*p);
4534 }
4535
4536 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4537 {
4538 list<SnapRealm*> q;
4539 q.push_back(realm);
4540
4541 while (!q.empty()) {
4542 realm = q.front();
4543 q.pop_front();
4544
4545 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4546 realm->invalidate_cache();
4547
4548 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4549 p != realm->pchildren.end();
4550 ++p)
4551 q.push_back(*p);
4552 }
4553 }
4554
4555 SnapRealm *Client::get_snap_realm(inodeno_t r)
4556 {
4557 SnapRealm *realm = snap_realms[r];
4558 if (!realm)
4559 snap_realms[r] = realm = new SnapRealm(r);
4560 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4561 realm->nref++;
4562 return realm;
4563 }
4564
4565 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4566 {
4567 if (snap_realms.count(r) == 0) {
4568 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4569 return NULL;
4570 }
4571 SnapRealm *realm = snap_realms[r];
4572 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4573 realm->nref++;
4574 return realm;
4575 }
4576
4577 void Client::put_snap_realm(SnapRealm *realm)
4578 {
4579 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4580 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4581 if (--realm->nref == 0) {
4582 snap_realms.erase(realm->ino);
4583 if (realm->pparent) {
4584 realm->pparent->pchildren.erase(realm);
4585 put_snap_realm(realm->pparent);
4586 }
4587 delete realm;
4588 }
4589 }
4590
4591 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4592 {
4593 if (realm->parent != parent) {
4594 ldout(cct, 10) << __func__ << " " << *realm
4595 << " " << realm->parent << " -> " << parent << dendl;
4596 realm->parent = parent;
4597 if (realm->pparent) {
4598 realm->pparent->pchildren.erase(realm);
4599 put_snap_realm(realm->pparent);
4600 }
4601 realm->pparent = get_snap_realm(parent);
4602 realm->pparent->pchildren.insert(realm);
4603 return true;
4604 }
4605 return false;
4606 }
4607
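// SnapContext snapids are ordered newest-first, so snaps[0] is the newest
// snap; a snap was created after old_snapc iff that id exceeds old_snapc's
// seq.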
4608 static bool has_new_snaps(const SnapContext& old_snapc,
4609 const SnapContext& new_snapc)
4610 {
4611 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4612 }
4613
4614
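// Apply a snap trace (a series of encoded SnapRealmInfo records) from the
// MDS. For every realm whose seq advanced we first (if 'flush' is set)
// remember the old snap context of the realm and its children so dirty
// inodes can be snapshotted under it, then install the new snap list and
// invalidate cached snap contexts. The first realm decoded is returned via
// realm_ret with a reference held (caller must put_snap_realm()).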
4615 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4616 {
4617 SnapRealm *first_realm = NULL;
4618 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4619
4620 map<SnapRealm*, SnapContext> dirty_realms;
4621
4622 auto p = bl.cbegin();
4623 while (!p.end()) {
4624 SnapRealmInfo info;
4625 decode(info, p);
4626 SnapRealm *realm = get_snap_realm(info.ino());
4627
4628 bool invalidate = false;
4629
4630 if (info.seq() > realm->seq) {
4631 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4632 << dendl;
4633
4634 if (flush) {
4635 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4636 // flush me + children
4637 list<SnapRealm*> q;
4638 q.push_back(realm);
4639 while (!q.empty()) {
4640 SnapRealm *realm = q.front();
4641 q.pop_front();
4642
4643 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4644 p != realm->pchildren.end();
4645 ++p)
4646 q.push_back(*p);
4647
4648 if (dirty_realms.count(realm) == 0) {
4649 realm->nref++;
4650 dirty_realms[realm] = realm->get_snap_context();
4651 }
4652 }
4653 }
4654
4655 // update
4656 realm->seq = info.seq();
4657 realm->created = info.created();
4658 realm->parent_since = info.parent_since();
4659 realm->prior_parent_snaps = info.prior_parent_snaps;
4660 realm->my_snaps = info.my_snaps;
4661 invalidate = true;
4662 }
4663
4664 // _always_ verify parent
4665 if (adjust_realm_parent(realm, info.parent()))
4666 invalidate = true;
4667
4668 if (invalidate) {
4669 invalidate_snaprealm_and_children(realm);
4670 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4671 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4672 } else {
4673 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4674 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4675 }
4676
4677 if (!first_realm)
4678 first_realm = realm;
4679 else
4680 put_snap_realm(realm);
4681 }
4682
4683 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4684 q != dirty_realms.end();
4685 ++q) {
4686 SnapRealm *realm = q->first;
4687 // are there new snaps?
4688 if (has_new_snaps(q->second, realm->get_snap_context())) {
4689 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4690 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4691 while (!r.end()) {
4692 Inode *in = *r;
4693 ++r;
4694 queue_cap_snap(in, q->second);
4695 }
4696 } else {
4697 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4698 }
4699 put_snap_realm(realm);
4700 }
4701
4702 if (realm_ret)
4703 *realm_ret = first_realm;
4704 else
4705 put_snap_realm(first_realm);
4706 }
4707
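// Handle an MClientSnap notification. For CEPH_SNAP_OP_SPLIT, inodes listed
// in split_inos move from their current realm into the realm being split
// off (each inode's old snap context is remembered so pre-split dirty data
// can still be snapshotted), and realms in split_realms are re-parented
// under it; the embedded snap trace is then applied as usual.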
4708 void Client::handle_snap(const MConstRef<MClientSnap>& m)
4709 {
4710 ldout(cct, 10) << __func__ << " " << *m << dendl;
4711 mds_rank_t mds = mds_rank_t(m->get_source().num());
4712 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4713 if (!session) {
4714 return;
4715 }
4716
4717 got_mds_push(session);
4718
4719 map<Inode*, SnapContext> to_move;
4720 SnapRealm *realm = 0;
4721
4722 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4723 ceph_assert(m->head.split);
4724 SnapRealmInfo info;
4725 auto p = m->bl.cbegin();
4726 decode(info, p);
4727 ceph_assert(info.ino() == m->head.split);
4728
4729 // flush, then move, the inodes.
4730 realm = get_snap_realm(info.ino());
4731 ldout(cct, 10) << " splitting off " << *realm << dendl;
4732 for (auto& ino : m->split_inos) {
4733 vinodeno_t vino(ino, CEPH_NOSNAP);
4734 if (inode_map.count(vino)) {
4735 Inode *in = inode_map[vino];
4736 if (!in->snaprealm || in->snaprealm == realm)
4737 continue;
4738 if (in->snaprealm->created > info.created()) {
4739 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4740 << *in->snaprealm << dendl;
4741 continue;
4742 }
4743 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4744
4745
4746 in->snaprealm_item.remove_myself();
4747 to_move[in] = in->snaprealm->get_snap_context();
4748 put_snap_realm(in->snaprealm);
4749 }
4750 }
4751
4752 // move child snaprealms, too
4753 for (auto& child_realm : m->split_realms) {
4754 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
4755 SnapRealm *child = get_snap_realm_maybe(child_realm);
4756 if (!child)
4757 continue;
4758 adjust_realm_parent(child, realm->ino);
4759 put_snap_realm(child);
4760 }
4761 }
4762
4763 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
4764
4765 if (realm) {
4766 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
4767 Inode *in = p->first;
4768 in->snaprealm = realm;
4769 realm->inodes_with_caps.push_back(&in->snaprealm_item);
4770 realm->nref++;
4771 // queue for snap writeback
4772 if (has_new_snaps(p->second, realm->get_snap_context()))
4773 queue_cap_snap(in, p->second);
4774 }
4775 put_snap_realm(realm);
4776 }
4777 }
4778
4779 void Client::handle_quota(const MConstRef<MClientQuota>& m)
4780 {
4781 mds_rank_t mds = mds_rank_t(m->get_source().num());
4782 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4783 if (!session) {
4784 return;
4785 }
4786
4787 got_mds_push(session);
4788
4789 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
4790
4791 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4792 if (inode_map.count(vino)) {
4793 Inode *in = inode_map[vino];
4795
4796 if (in) {
4797 in->quota = m->quota;
4798 in->rstat = m->rstat;
4799 }
4800 }
4801 }
4802
4803 void Client::handle_caps(const MConstRef<MClientCaps>& m)
4804 {
4805 mds_rank_t mds = mds_rank_t(m->get_source().num());
4806 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4807 if (!session) {
4808 return;
4809 }
4810
4811 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4812 // Pause RADOS operations until we see the required epoch
4813 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4814 }
4815
4816 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4817 // Record the barrier so that we will transmit it to MDS when releasing
4818 set_cap_epoch_barrier(m->osd_epoch_barrier);
4819 }
4820
4821 got_mds_push(session);
4822
4823 Inode *in;
4824 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4825 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4826 in = it->second;
4827 } else {
4828 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4829 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4830 session->enqueue_cap_release(
4831 m->get_ino(),
4832 m->get_cap_id(),
4833 m->get_seq(),
4834 m->get_mseq(),
4835 cap_epoch_barrier);
4836 } else {
4837 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
4838 }
4839
4840 // in case the mds is waiting on e.g. a revocation
4841 flush_cap_releases();
4842 return;
4843 }
4844
4845 switch (m->get_op()) {
4846 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4847 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4848 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
4849 }
4850
4851 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4852 Cap &cap = in->caps.at(mds);
4853
4854 switch (m->get_op()) {
4855 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4856 case CEPH_CAP_OP_IMPORT:
4857 case CEPH_CAP_OP_REVOKE:
4858 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4859 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4860 }
4861 } else {
4862 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4863 return;
4864 }
4865 }
4866
4867 void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4868 {
4869 mds_rank_t mds = session->mds_num;
4870
4871 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
4872 << " IMPORT from mds." << mds << dendl;
4873
4874 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4875 Cap *cap = NULL;
4876 UserPerm cap_perms;
4877 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
4878 cap = &it->second;
4879 cap_perms = cap->latest_perms;
4880 }
4881
4882 // add/update it
4883 SnapRealm *realm = NULL;
4884 update_snap_trace(m->snapbl, &realm);
4885
4886 add_update_cap(in, session, m->get_cap_id(),
4887 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
4888 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
4889
4890 if (cap && cap->cap_id == m->peer.cap_id) {
4891 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
4892 }
4893
4894 if (realm)
4895 put_snap_realm(realm);
4896
4897 if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
4898 // reflush any/all caps (if we are now the auth_cap)
4899 if (in->cap_snaps.size())
4900 flush_snaps(in, true);
4901 if (in->flushing_caps)
4902 flush_caps(in, session);
4903 }
4904 }
4905
4906 void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4907 {
4908 mds_rank_t mds = session->mds_num;
4909
4910 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
4911 << " EXPORT from mds." << mds << dendl;
4912
4913 auto it = in->caps.find(mds);
4914 if (it != in->caps.end()) {
4915 Cap &cap = it->second;
4916 if (cap.cap_id == m->get_cap_id()) {
4917 if (m->peer.cap_id) {
4918 const auto peer_mds = mds_rank_t(m->peer.mds);
4919 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
4920 auto it = in->caps.find(peer_mds);
4921 if (it != in->caps.end()) {
4922 Cap &tcap = it->second;
4923 if (tcap.cap_id == m->peer.cap_id &&
4924 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
4925 tcap.cap_id = m->peer.cap_id;
4926 tcap.seq = m->peer.seq - 1;
4927 tcap.issue_seq = tcap.seq;
4928 tcap.issued |= cap.issued;
4929 tcap.implemented |= cap.issued;
4930 if (&cap == in->auth_cap)
4931 in->auth_cap = &tcap;
4932 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
4933 adjust_session_flushing_caps(in, session, tsession);
4934 }
4935 } else {
4936 add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
4937 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
4938 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
4939 cap.latest_perms);
4940 }
4941 } else {
4942 if (cap.wanted | cap.issued)
4943 in->flags |= I_CAP_DROPPED;
4944 }
4945
4946 remove_cap(&cap, false);
4947 }
4948 }
4949 }
4950
4951 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4952 {
4953 mds_rank_t mds = session->mds_num;
4954 ceph_assert(in->caps.count(mds));
4955
4956 ldout(cct, 10) << __func__ << " on ino " << *in
4957 << " size " << in->size << " -> " << m->get_size()
4958 << dendl;
4959
4960 int issued;
4961 in->caps_issued(&issued);
4962 issued |= in->caps_dirty();
4963 update_inode_file_size(in, issued, m->get_size(),
4964 m->get_truncate_seq(), m->get_truncate_size());
4965 }
4966
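// Note: the MDS acks cap flushes in tid order, so an ack for flush_ack_tid
// implicitly covers any older tids still in flushing_cap_tids. 'cleaned'
// starts as the acked tid's dirty bits and is reduced by any bits still
// being flushed under a newer tid.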
4967 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
4968 {
4969 ceph_tid_t flush_ack_tid = m->get_client_tid();
4970 int dirty = m->get_dirty();
4971 int cleaned = 0;
4972 int flushed = 0;
4973
4974 auto it = in->flushing_cap_tids.begin();
4975 if (it->first < flush_ack_tid) {
4976 ldout(cct, 0) << __func__ << " mds." << session->mds_num
4977 << " got unexpected flush ack tid " << flush_ack_tid
4978 << " expected is " << it->first << dendl;
4979 }
4980 for (; it != in->flushing_cap_tids.end(); ) {
4981 if (it->first == flush_ack_tid)
4982 cleaned = it->second;
4983 if (it->first <= flush_ack_tid) {
4984 session->flushing_caps_tids.erase(it->first);
4985 in->flushing_cap_tids.erase(it++);
4986 ++flushed;
4987 continue;
4988 }
4989 cleaned &= ~it->second;
4990 if (!cleaned)
4991 break;
4992 ++it;
4993 }
4994
4995 ldout(cct, 5) << __func__ << " mds." << session->mds_num
4996 << " cleaned " << ccap_string(cleaned) << " on " << *in
4997 << " with " << ccap_string(dirty) << dendl;
4998
4999 if (flushed) {
5000 signal_cond_list(in->waitfor_caps);
5001 if (session->flushing_caps_tids.empty() ||
5002 *session->flushing_caps_tids.begin() > flush_ack_tid)
5003 sync_cond.Signal();
5004 }
5005
5006 if (!dirty) {
5007 in->cap_dirtier_uid = -1;
5008 in->cap_dirtier_gid = -1;
5009 }
5010
5011 if (!cleaned) {
5012 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5013 } else {
5014 if (in->flushing_caps) {
5015 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5016 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5017 in->flushing_caps &= ~cleaned;
5018 if (in->flushing_caps == 0) {
5019 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5020 num_flushing_caps--;
5021 if (in->cap_snaps.empty())
5022 in->flushing_cap_item.remove_myself();
5023 }
5024 if (!in->caps_dirty())
5025 put_inode(in);
5026 }
5027 }
5028 }
5029
5030
5031 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5032 {
5033 mds_rank_t mds = session->mds_num;
5034 ceph_assert(in->caps.count(mds));
5035 snapid_t follows = m->get_snap_follows();
5036
5037 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5038 auto& capsnap = it->second;
5039 if (m->get_client_tid() != capsnap.flush_tid) {
5040 ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
5041 } else {
5042 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
5043 << " on " << *in << dendl;
5044 InodeRef tmp_ref;
5045 if (in->get_num_ref() == 1)
5046 tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
5047 if (in->flushing_caps == 0 && in->cap_snaps.empty())
5048 in->flushing_cap_item.remove_myself();
5049 session->flushing_caps_tids.erase(capsnap.flush_tid);
5050 in->cap_snaps.erase(it);
5051 }
5052 } else {
5053 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
5054 << " on " << *in << dendl;
5055 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple FLUSHEDSNAPs back
5056 }
5057 }
5058
5059 class C_Client_DentryInvalidate : public Context {
5060 private:
5061 Client *client;
5062 vinodeno_t dirino;
5063 vinodeno_t ino;
5064 string name;
5065 public:
5066 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5067 client(c), name(dn->name) {
5068 if (client->use_faked_inos()) {
5069 dirino.ino = dn->dir->parent_inode->faked_ino;
5070 if (del)
5071 ino.ino = dn->inode->faked_ino;
5072 } else {
5073 dirino = dn->dir->parent_inode->vino();
5074 if (del)
5075 ino = dn->inode->vino();
5076 }
5077 if (!del)
5078 ino.ino = inodeno_t();
5079 }
5080 void finish(int r) override {
5081 // _async_dentry_invalidate is responsible for its own locking
5082 ceph_assert(!client->client_lock.is_locked_by_me());
5083 client->_async_dentry_invalidate(dirino, ino, name);
5084 }
5085 };
5086
5087 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5088 {
5089 if (unmounting)
5090 return;
5091 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5092 << " in dir " << dirino << dendl;
5093 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5094 }
5095
5096 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5097 {
5098 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5099 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5100 }
5101
5102 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5103 {
5104 int ref = in->get_num_ref();
5105
5106 if (in->dir && !in->dir->dentries.empty()) {
5107 for (auto p = in->dir->dentries.begin();
5108 p != in->dir->dentries.end(); ) {
5109 Dentry *dn = p->second;
5110 ++p;
5111 /* rmsnap removes the whole subtree, so we need to trim inodes recursively.
5112 * We don't need to invalidate dentries recursively, because invalidating
5113 * a directory dentry effectively invalidates the whole subtree. */
5115 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5116 _try_to_trim_inode(dn->inode.get(), false);
5117
5118 if (dn->lru_is_expireable())
5119 unlink(dn, true, false); // keep dir, drop dentry
5120 }
5121 if (in->dir->dentries.empty()) {
5122 close_dir(in->dir);
5123 --ref;
5124 }
5125 }
5126
5127 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
5128 InodeRef snapdir = open_snapdir(in);
5129 _try_to_trim_inode(snapdir.get(), false);
5130 --ref;
5131 }
5132
5133 if (ref > 0 && in->ll_ref > 0 && sched_inval) {
5134 auto q = in->dentries.begin();
5135 while (q != in->dentries.end()) {
5136 Dentry *dn = *q;
5137 ++q;
5138 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5139 // so in->dentries doesn't always reflect the state of the kernel's dcache.
5140 _schedule_invalidate_dentry_callback(dn, true);
5141 unlink(dn, true, true);
5142 }
5143 }
5144 }
5145
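// Process a cap GRANT/REVOKE (or the tail of an IMPORT) from the MDS:
// refresh the inode fields covered by the newly issued caps, then reconcile
// cap state. A revocation is acked via check_caps() once cached data has
// been flushed (FILE_BUFFER) or released (FILE_CACHE); a pure grant is only
// acked if another MDS appears to be revoking what this one just granted.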
5146 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5147 {
5148 mds_rank_t mds = session->mds_num;
5149 int used = get_caps_used(in);
5150 int wanted = in->caps_wanted();
5151
5152 const unsigned new_caps = m->get_caps();
5153 const bool was_stale = session->cap_gen > cap->gen;
5154 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5155 << " mds." << mds << " seq " << m->get_seq()
5156 << " caps now " << ccap_string(new_caps)
5157 << " was " << ccap_string(cap->issued)
5158 << (was_stale ? " (stale)" : "") << dendl;
5159
5160 if (was_stale)
5161 cap->issued = cap->implemented = CEPH_CAP_PIN;
5162 cap->seq = m->get_seq();
5163 cap->gen = session->cap_gen;
5164
5165 check_cap_issue(in, new_caps);
5166
5167 // update inode
5168 int issued;
5169 in->caps_issued(&issued);
5170 issued |= in->caps_dirty();
5171
5172 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5173 !(issued & CEPH_CAP_AUTH_EXCL)) {
5174 in->mode = m->head.mode;
5175 in->uid = m->head.uid;
5176 in->gid = m->head.gid;
5177 in->btime = m->btime;
5178 }
5179 bool deleted_inode = false;
5180 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5181 !(issued & CEPH_CAP_LINK_EXCL)) {
5182 in->nlink = m->head.nlink;
5183 if (in->nlink == 0 &&
5184 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5185 deleted_inode = true;
5186 }
5187 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5188 m->xattrbl.length() &&
5189 m->head.xattr_version > in->xattr_version) {
5190 auto p = m->xattrbl.cbegin();
5191 decode(in->xattrs, p);
5192 in->xattr_version = m->head.xattr_version;
5193 }
5194
5195 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5196 in->dirstat.nfiles = m->get_nfiles();
5197 in->dirstat.nsubdirs = m->get_nsubdirs();
5198 }
5199
5200 if (new_caps & CEPH_CAP_ANY_RD) {
5201 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5202 m->get_ctime(), m->get_mtime(), m->get_atime());
5203 }
5204
5205 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5206 in->layout = m->get_layout();
5207 update_inode_file_size(in, issued, m->get_size(),
5208 m->get_truncate_seq(), m->get_truncate_size());
5209 }
5210
5211 if (m->inline_version > in->inline_version) {
5212 in->inline_data = m->inline_data;
5213 in->inline_version = m->inline_version;
5214 }
5215
5216 /* always take a newer change attr */
5217 if (m->get_change_attr() > in->change_attr)
5218 in->change_attr = m->get_change_attr();
5219
5220 // max_size
5221 if (cap == in->auth_cap &&
5222 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5223 (m->get_max_size() != in->max_size)) {
5224 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5225 in->max_size = m->get_max_size();
5226 if (in->max_size > in->wanted_max_size) {
5227 in->wanted_max_size = 0;
5228 in->requested_max_size = 0;
5229 }
5230 }
5231
5232 bool check = false;
5233 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5234 (wanted & ~(cap->wanted | new_caps))) {
5235 // If the mds is importing the cap, prior cap messages that update 'wanted'
5236 // may get dropped by the mds (migrate seq mismatch).
5237 //
5238 // We don't send a cap message to update 'wanted' if what we want is
5239 // already issued. If the mds revokes caps, the cap message that releases
5240 // caps also tells the mds what we want. But if caps were revoked forcibly
5241 // by the mds (session stale), we may not have told the mds what we want.
5242 check = true;
5243 }
5244
5245
5246 // update caps
5247 auto revoked = cap->issued & ~new_caps;
5248 if (revoked) {
5249 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5250 cap->issued = new_caps;
5251 cap->implemented |= new_caps;
5252
5253 // recall delegations if we're losing caps necessary for them
5254 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5255 in->recall_deleg(false);
5256 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5257 in->recall_deleg(true);
5258
5259 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5260 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5261 !_flush(in, new C_Client_FlushComplete(this, in))) {
5262 // waitin' for flush
5263 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5264 if (_release(in))
5265 check = true;
5266 } else {
5267 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5268 check = true;
5269 }
5270 } else if (cap->issued == new_caps) {
5271 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5272 } else {
5273 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5274 cap->issued = new_caps;
5275 cap->implemented |= new_caps;
5276
5277 if (cap == in->auth_cap) {
5278 // is a non-auth MDS revoking the newly granted caps?
5279 for (const auto &p : in->caps) {
5280 if (&p.second == cap)
5281 continue;
5282 if (p.second.implemented & ~p.second.issued & new_caps) {
5283 check = true;
5284 break;
5285 }
5286 }
5287 }
5288 }
5289
5290 if (check)
5291 check_caps(in, 0);
5292
5293 // wake up waiters
5294 if (new_caps)
5295 signal_cond_list(in->waitfor_caps);
5296
5297 // may drop inode's last ref
5298 if (deleted_inode)
5299 _try_to_trim_inode(in, true);
5300 }
5301
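// Unix permission check: root bypasses everything. If the caller is not the
// owner and the group bits are set, consult the POSIX ACL first;
// _posix_acl_permission() returns -EAGAIN when no ACL applies, in which
// case we fall back to the plain mode-bit check.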
5302 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5303 {
5304 if (perms.uid() == 0)
5305 return 0;
5306
5307 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5308 int ret = _posix_acl_permission(in, perms, want);
5309 if (ret != -EAGAIN)
5310 return ret;
5311 }
5312
5313 // check permissions before doing anything else
5314 if (!in->check_mode(perms, want))
5315 return -EACCES;
5316 return 0;
5317 }
5318
5319 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5320 const UserPerm& perms)
5321 {
5322 int r = _getattr_for_perm(in, perms);
5323 if (r < 0)
5324 goto out;
5325
5326 r = 0;
5327 if (strncmp(name, "system.", 7) == 0) {
5328 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5329 r = -EPERM;
5330 } else {
5331 r = inode_permission(in, perms, want);
5332 }
5333 out:
5334 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5335 return r;
5336 }
5337
5338 ostream& operator<<(ostream &out, const UserPerm& perm) {
5339 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5340 return out;
5341 }
5342
5343 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5344 const UserPerm& perms)
5345 {
5346 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5347 int r = _getattr_for_perm(in, perms);
5348 if (r < 0)
5349 goto out;
5350
5351 if (mask & CEPH_SETATTR_SIZE) {
5352 r = inode_permission(in, perms, MAY_WRITE);
5353 if (r < 0)
5354 goto out;
5355 }
5356
5357 r = -EPERM;
5358 if (mask & CEPH_SETATTR_UID) {
5359 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5360 goto out;
5361 }
5362 if (mask & CEPH_SETATTR_GID) {
5363 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5364 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5365 goto out;
5366 }
5367
5368 if (mask & CEPH_SETATTR_MODE) {
5369 if (perms.uid() != 0 && perms.uid() != in->uid)
5370 goto out;
5371
5372 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5373 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5374 stx->stx_mode &= ~S_ISGID;
5375 }
5376
5377 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5378 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5379 if (perms.uid() != 0 && perms.uid() != in->uid) {
5380 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5381 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5382 check_mask |= CEPH_SETATTR_MTIME;
5383 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5384 check_mask |= CEPH_SETATTR_ATIME;
5385 if (check_mask & mask) {
5386 goto out;
5387 } else {
5388 r = inode_permission(in, perms, MAY_WRITE);
5389 if (r < 0)
5390 goto out;
5391 }
5392 }
5393 }
5394 r = 0;
5395 out:
5396 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5397 return r;
5398 }
5399
5400 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5401 {
5402 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5403 unsigned want = 0;
5404
5405 if ((flags & O_ACCMODE) == O_WRONLY)
5406 want = MAY_WRITE;
5407 else if ((flags & O_ACCMODE) == O_RDWR)
5408 want = MAY_READ | MAY_WRITE;
5409 else if ((flags & O_ACCMODE) == O_RDONLY)
5410 want = MAY_READ;
5411 if (flags & O_TRUNC)
5412 want |= MAY_WRITE;
5413
5414 int r = 0;
5415 switch (in->mode & S_IFMT) {
5416 case S_IFLNK:
5417 r = -ELOOP;
5418 goto out;
5419 case S_IFDIR:
5420 if (want & MAY_WRITE) {
5421 r = -EISDIR;
5422 goto out;
5423 }
5424 break;
5425 }
5426
5427 r = _getattr_for_perm(in, perms);
5428 if (r < 0)
5429 goto out;
5430
5431 r = inode_permission(in, perms, want);
5432 out:
5433 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5434 return r;
5435 }
5436
5437 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5438 {
5439 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5440 int r = _getattr_for_perm(dir, perms);
5441 if (r < 0)
5442 goto out;
5443
5444 r = inode_permission(dir, perms, MAY_EXEC);
5445 out:
5446 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5447 return r;
5448 }
5449
5450 int Client::may_create(Inode *dir, const UserPerm& perms)
5451 {
5452 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5453 int r = _getattr_for_perm(dir, perms);
5454 if (r < 0)
5455 goto out;
5456
5457 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5458 out:
5459 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5460 return r;
5461 }
5462
5463 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5464 {
5465 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5466 int r = _getattr_for_perm(dir, perms);
5467 if (r < 0)
5468 goto out;
5469
5470 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5471 if (r < 0)
5472 goto out;
5473
5474 /* 'name == NULL' means rmsnap */
5475 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5476 InodeRef otherin;
5477 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5478 if (r < 0)
5479 goto out;
5480 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5481 r = -EPERM;
5482 }
5483 out:
5484 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5485 return r;
5486 }
5487
5488 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5489 {
5490 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5491 int r = _getattr_for_perm(in, perms);
5492 if (r < 0)
5493 goto out;
5494
5495 if (perms.uid() == 0 || perms.uid() == in->uid) {
5496 r = 0;
5497 goto out;
5498 }
5499
5500 r = -EPERM;
5501 if (!S_ISREG(in->mode))
5502 goto out;
5503
5504 if (in->mode & S_ISUID)
5505 goto out;
5506
5507 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5508 goto out;
5509
5510 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5511 out:
5512 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5513 return r;
5514 }
5515
5516 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5517 {
5518 int mask = CEPH_STAT_CAP_MODE;
5519 bool force = false;
5520 if (acl_type != NO_ACL) {
5521 mask |= CEPH_STAT_CAP_XATTR;
5522 force = in->xattr_version == 0;
5523 }
5524 return _getattr(in, mask, perms, force);
5525 }
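// Note: when POSIX ACLs are enabled, permission checks also need the inode's
// xattrs (where the ACL lives), so the mask grows by CEPH_STAT_CAP_XATTR and
// the getattr is forced while xattr_version is still 0, i.e. before the
// xattrs have ever been fetched from the MDS.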
5526
5527 vinodeno_t Client::_get_vino(Inode *in)
5528 {
5529 /* The caller must hold the client lock */
5530 return vinodeno_t(in->ino, in->snapid);
5531 }
5532
5533 /**
5534 * Resolve an MDS spec to a list of MDS daemon GIDs.
5535 *
5536 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5537 * It may be '*', in which case it matches all GIDs.
5538 *
5539 * If no error is returned, the `targets` vector will be populated with at least
5540 * one MDS.
5541 */
5542 int Client::resolve_mds(
5543 const std::string &mds_spec,
5544 std::vector<mds_gid_t> *targets)
5545 {
5546 ceph_assert(fsmap);
5547 ceph_assert(targets != nullptr);
5548
5549 mds_role_t role;
5550 std::stringstream ss;
5551 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5552 if (role_r == 0) {
5553 // We got a role, resolve it to a GID
5554 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5555 << role << "'" << dendl;
5556 targets->push_back(
5557 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5558 return 0;
5559 }
5560
5561 std::string strtol_err;
5562 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5563 if (strtol_err.empty()) {
5564 // It is a possible GID
5565 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5566 if (fsmap->gid_exists(mds_gid)) {
5567 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5568 targets->push_back(mds_gid);
5569 } else {
5570 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5571 << dendl;
5572 return -ENOENT;
5573 }
5574 } else if (mds_spec == "*") {
5575 // It is a wildcard: use all MDSs
5576 const auto mds_info = fsmap->get_mds_info();
5577
5578 if (mds_info.empty()) {
5579 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5580 return -ENOENT;
5581 }
5582
5583 for (const auto &i : mds_info) {
5584 targets->push_back(i.first);
5585 }
5586 } else {
5587 // It did not parse as an integer, it is not a wildcard, it must be a name
5588 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5589 if (mds_gid == 0) {
5590 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5591
5592 lderr(cct) << "FSMap: " << *fsmap << dendl;
5593
5594 return -ENOENT;
5595 } else {
5596 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5597 << "' to GID " << mds_gid << dendl;
5598 targets->push_back(mds_gid);
5599 }
5600 }
5601
5602 return 0;
5603 }
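// Illustrative specs this accepts (a sketch; the GID and name below are
// hypothetical):
//
//   std::vector<mds_gid_t> targets;
//   resolve_mds("0", &targets);        // rank 0 of the default filesystem
//   resolve_mds("cephfs:0", &targets); // filesystem:rank
//   resolve_mds("4115", &targets);     // a raw GID, if present in the FSMap
//   resolve_mds("a", &targets);        // a daemon name/id
//   resolve_mds("*", &targets);        // every MDS daemon in the FSMap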
5604
5605
5606 /**
5607 * Authenticate with mon and establish global ID
5608 */
5609 int Client::authenticate()
5610 {
5611 ceph_assert(client_lock.is_locked_by_me());
5612
5613 if (monclient->is_authenticated()) {
5614 return 0;
5615 }
5616
5617 client_lock.Unlock();
5618 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5619 client_lock.Lock();
5620 if (r < 0) {
5621 return r;
5622 }
5623
5624 whoami = monclient->get_global_id();
5625 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5626
5627 return 0;
5628 }
5629
5630 int Client::fetch_fsmap(bool user)
5631 {
5632 int r;
5633 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5634 // rather than MDSMap because no one MDSMap contains all the daemons, and
5635 // a `tell` can address any daemon.
5636 version_t fsmap_latest;
5637 do {
5638 C_SaferCond cond;
5639 monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
5640 client_lock.Unlock();
5641 r = cond.wait();
5642 client_lock.Lock();
5643 } while (r == -EAGAIN);
5644
5645 if (r < 0) {
5646 lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
5647 return r;
5648 }
5649
5650 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5651
5652 if (user) {
5653 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5654 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5655 monclient->renew_subs();
5656 wait_on_list(waiting_for_fsmap);
5657 }
5658 ceph_assert(fsmap_user);
5659 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
5660 } else {
5661 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5662 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5663 monclient->renew_subs();
5664 wait_on_list(waiting_for_fsmap);
5665 }
5666 ceph_assert(fsmap);
5667 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
5668 }
5669 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5670 << fsmap_latest << dendl;
5671 return 0;
5672 }
5673
5674 /**
5675 * Send a command to one or more MDS daemons.
5676 *
5677 * @mds_spec one of ID, rank, GID, "*"
5678 */
5679 int Client::mds_command(
5680 const std::string &mds_spec,
5681 const vector<string>& cmd,
5682 const bufferlist& inbl,
5683 bufferlist *outbl,
5684 string *outs,
5685 Context *onfinish)
5686 {
5687 std::lock_guard lock(client_lock);
5688
5689 if (!initialized)
5690 return -ENOTCONN;
5691
5692 int r;
5693 r = authenticate();
5694 if (r < 0) {
5695 return r;
5696 }
5697
5698 r = fetch_fsmap(false);
5699 if (r < 0) {
5700 return r;
5701 }
5702
5703 // Look up MDS target(s) of the command
5704 std::vector<mds_gid_t> targets;
5705 r = resolve_mds(mds_spec, &targets);
5706 if (r < 0) {
5707 return r;
5708 }
5709
5710 // If daemons are laggy, we won't send them commands. If all
5711 // are laggy then we fail.
5712 std::vector<mds_gid_t> non_laggy;
5713 for (const auto gid : targets) {
5714 const auto info = fsmap->get_info_gid(gid);
5715 if (!info.laggy()) {
5716 non_laggy.push_back(gid);
5717 }
5718 }
5719 if (non_laggy.empty()) {
5720 *outs = "All targeted MDS daemons are laggy";
5721 return -ENOENT;
5722 }
5723
5724 if (metadata.empty()) {
5725 // We are called on an unmounted client, so metadata
5726 // won't be initialized yet.
5727 populate_metadata("");
5728 }
5729
5730 // Send commands to targets
5731 C_GatherBuilder gather(cct, onfinish);
5732 for (const auto target_gid : non_laggy) {
5733 const auto info = fsmap->get_info_gid(target_gid);
5734
5735 // Open a connection to the target MDS
5736 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
5737
5738 // Generate MDSCommandOp state
5739 auto &op = command_table.start_command();
5740
5741 op.on_finish = gather.new_sub();
5742 op.cmd = cmd;
5743 op.outbl = outbl;
5744 op.outs = outs;
5745 op.inbl = inbl;
5746 op.mds_gid = target_gid;
5747 op.con = conn;
5748
5749 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5750 << " tid=" << op.tid << cmd << dendl;
5751
5752 // Construct and send MCommand
5753 auto m = op.get_message(monclient->get_fsid());
5754 conn->send_message2(std::move(m));
5755 }
5756 gather.activate();
5757
5758 return 0;
5759 }
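// Usage sketch (the command JSON is hypothetical):
//
//   bufferlist inbl, outbl;
//   std::string outs;
//   C_SaferCond cond;
//   int r = mds_command("*", {"{\"prefix\": \"session ls\"}"},
//                       inbl, &outbl, &outs, &cond);
//   if (r == 0)
//     r = cond.wait();  // fires once every targeted MDS has replied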
5760
5761 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
5762 {
5763 ceph_tid_t const tid = m->get_tid();
5764
5765 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5766
5767 if (!command_table.exists(tid)) {
5768 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5769 return;
5770 }
5771
5772 auto &op = command_table.get_command(tid);
5773 if (op.outbl) {
5774 *op.outbl = m->get_data();
5775 }
5776 if (op.outs) {
5777 *op.outs = m->rs;
5778 }
5779
5780 if (op.on_finish) {
5781 op.on_finish->complete(m->r);
5782 }
5783
5784 command_table.erase(tid);
5785 }
5786
5787 // -------------------
5788 // MOUNT
5789
5790 int Client::subscribe_mdsmap(const std::string &fs_name)
5791 {
5792 int r = authenticate();
5793 if (r < 0) {
5794 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5795 return r;
5796 }
5797
5798 std::string resolved_fs_name;
5799 if (fs_name.empty()) {
5800 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
5801 } else {
5802 resolved_fs_name = fs_name;
5803 }
5804
5805 std::string want = "mdsmap";
5806 if (!resolved_fs_name.empty()) {
5807 r = fetch_fsmap(true);
5808 if (r < 0)
5809 return r;
5810 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5811 if (fscid == FS_CLUSTER_ID_NONE) {
5812 return -ENOENT;
5813 }
5814
5815 std::ostringstream oss;
5816 oss << want << "." << fscid;
5817 want = oss.str();
5818 }
5819 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5820
5821 monclient->sub_want(want, 0, 0);
5822 monclient->renew_subs();
5823
5824 return 0;
5825 }
5826
5827 int Client::mount(const std::string &mount_root, const UserPerm& perms,
5828 bool require_mds, const std::string &fs_name)
5829 {
5830 std::lock_guard lock(client_lock);
5831
5832 if (mounted) {
5833 ldout(cct, 5) << "already mounted" << dendl;
5834 return 0;
5835 }
5836
5837 unmounting = false;
5838
5839 int r = subscribe_mdsmap(fs_name);
5840 if (r < 0) {
5841 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
5842 return r;
5843 }
5844
5845 tick(); // start tick
5846
5847 if (require_mds) {
5848 while (1) {
5849 auto availability = mdsmap->is_cluster_available();
5850 if (availability == MDSMap::STUCK_UNAVAILABLE) {
5851 // Error out
5852 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
5853 return CEPH_FUSE_NO_MDS_UP;
5854 } else if (availability == MDSMap::AVAILABLE) {
5855 // Continue to mount
5856 break;
5857 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
5858 // Else, wait. MDSMonitor will update the map to bring
5859 // us to a conclusion eventually.
5860 wait_on_list(waiting_for_mdsmap);
5861 } else {
5862 // Unexpected value!
5863 ceph_abort();
5864 }
5865 }
5866 }
5867
5868 populate_metadata(mount_root.empty() ? "/" : mount_root);
5869
5870 filepath fp(CEPH_INO_ROOT);
5871 if (!mount_root.empty()) {
5872 fp = filepath(mount_root.c_str());
5873 }
5874 while (true) {
5875 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
5876 req->set_filepath(fp);
5877 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
5878 int res = make_request(req, perms);
5879 if (res < 0) {
5880 if (res == -EACCES && root) {
5881 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
5882 break;
5883 }
5884 return res;
5885 }
5886
5887 if (fp.depth())
5888 fp.pop_dentry();
5889 else
5890 break;
5891 }
5892
5893 ceph_assert(root);
5894 _ll_get(root);
5895
5896 mounted = true;
5897
5898 // trace?
5899 if (!cct->_conf->client_trace.empty()) {
5900 traceout.open(cct->_conf->client_trace.c_str());
5901 if (traceout.is_open()) {
5902 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
5903 } else {
5904 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
5905 }
5906 }
5907
5908 /*
5909 ldout(cct, 3) << "op: // client trace data structs" << dendl;
5910 ldout(cct, 3) << "op: struct stat st;" << dendl;
5911 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
5912 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
5913 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
5914 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
5915 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
5916 ldout(cct, 3) << "op: int fd;" << dendl;
5917 */
5918 return 0;
5919 }
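// Typical mount sequence (a sketch; assumes init() has already been called):
//
//   UserPerm perms(getuid(), getgid());
//   int r = client->mount("/", perms, false /* require_mds */, "cephfs");
//   // r == 0 on success; CEPH_FUSE_NO_MDS_UP is only possible with
//   // require_mds == true.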
5920
5921 // UNMOUNT
5922
5923 void Client::_close_sessions()
5924 {
5925 while (!mds_sessions.empty()) {
5926 // send session closes!
5927 for (auto &p : mds_sessions) {
5928 if (p.second.state != MetaSession::STATE_CLOSING) {
5929 _close_mds_session(&p.second);
5930 }
5931 }
5932
5933 // wait for sessions to close
5934 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5935 mount_cond.Wait(client_lock);
5936 }
5937 }
5938
5939 void Client::flush_mdlog_sync()
5940 {
5941 if (mds_requests.empty())
5942 return;
5943 for (auto &p : mds_sessions) {
5944 flush_mdlog(&p.second);
5945 }
5946 }
5947
5948 void Client::flush_mdlog(MetaSession *session)
5949 {
5950 // Only send this to Luminous or newer MDS daemons, older daemons
5951 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5952 const uint64_t features = session->con->get_features();
5953 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5954 auto m = MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5955 session->con->send_message2(std::move(m));
5956 }
5957 }
5958
5959
5960 void Client::_abort_mds_sessions(int err)
5961 {
5962 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
5963 auto req = p->second;
5964 ++p;
5965 // unsafe requests will be removed during close session below.
5966 if (req->got_unsafe)
5967 continue;
5968
5969 req->abort(err);
5970 if (req->caller_cond) {
5971 req->kick = true;
5972 req->caller_cond->Signal();
5973 }
5974 }
5975
5976 // Process aborts on any requests that were on this waitlist.
5977 // Any requests that were on a waiting_for_open session waitlist
5978 // will get kicked during close session below.
5979 signal_cond_list(waiting_for_mdsmap);
5980
5981 // Force-close all sessions
5982 while(!mds_sessions.empty()) {
5983 auto& session = mds_sessions.begin()->second;
5984 _closed_mds_session(&session);
5985 }
5986 }
5987
5988 void Client::_unmount(bool abort)
5989 {
5990 if (unmounting)
5991 return;
5992
5993 if (abort || blacklisted) {
5994 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
5995 } else {
5996 ldout(cct, 2) << "unmounting" << dendl;
5997 }
5998 unmounting = true;
5999
6000 deleg_timeout = 0;
6001
6002 if (abort) {
6003 // Abort all mds sessions
6004 _abort_mds_sessions(-ENOTCONN);
6005
6006 objecter->op_cancel_writes(-ENOTCONN);
6007 } else {
6008 // flush the mdlog for pending requests, if any
6009 flush_mdlog_sync();
6010 }
6011
6012 while (!mds_requests.empty()) {
6013 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
6014 mount_cond.Wait(client_lock);
6015 }
6016
6017 if (tick_event)
6018 timer.cancel_event(tick_event);
6019 tick_event = 0;
6020
6021 cwd.reset();
6022
6023 // clean up any unclosed files
6024 while (!fd_map.empty()) {
6025 Fh *fh = fd_map.begin()->second;
6026 fd_map.erase(fd_map.begin());
6027 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6028 _release_fh(fh);
6029 }
6030
6031 while (!ll_unclosed_fh_set.empty()) {
6032 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6033 Fh *fh = *it;
6034 ll_unclosed_fh_set.erase(fh);
6035 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6036 _release_fh(fh);
6037 }
6038
6039 while (!opened_dirs.empty()) {
6040 dir_result_t *dirp = *opened_dirs.begin();
6041 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6042 _closedir(dirp);
6043 }
6044
6045 _ll_drop_pins();
6046
6047 while (unsafe_sync_write > 0) {
6048 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
6049 mount_cond.Wait(client_lock);
6050 }
6051
6052 if (cct->_conf->client_oc) {
6053 // flush/release all buffered data
6054 std::list<InodeRef> anchor;
6055 for (auto& p : inode_map) {
6056 Inode *in = p.second;
6057 if (!in) {
6058 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6059 ceph_assert(in);
6060 }
6061
6062 // prevent inode from getting freed
6063 anchor.emplace_back(in);
6064
6065 if (abort || blacklisted) {
6066 objectcacher->purge_set(&in->oset);
6067 } else if (!in->caps.empty()) {
6068 _release(in);
6069 _flush(in, new C_Client_FlushComplete(this, in));
6070 }
6071 }
6072 }
6073
6074 if (abort || blacklisted) {
6075 for (auto p = dirty_list.begin(); !p.end(); ) {
6076 Inode *in = *p;
6077 ++p;
6078 if (in->dirty_caps) {
6079 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6080 in->mark_caps_clean();
6081 put_inode(in);
6082 }
6083 }
6084 } else {
6085 flush_caps_sync();
6086 wait_sync_caps(last_flush_tid);
6087 }
6088
6089 // empty lru cache
6090 trim_cache();
6091
6092 while (lru.lru_get_size() > 0 ||
6093 !inode_map.empty()) {
6094 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6095 << "+" << inode_map.size() << " items"
6096 << ", waiting (for caps to release?)"
6097 << dendl;
6098 utime_t until = ceph_clock_now() + utime_t(5, 0);
6099 int r = mount_cond.WaitUntil(client_lock, until);
6100 if (r == ETIMEDOUT) {
6101 dump_cache(NULL);
6102 }
6103 }
6104 ceph_assert(lru.lru_get_size() == 0);
6105 ceph_assert(inode_map.empty());
6106
6107 // stop tracing
6108 if (!cct->_conf->client_trace.empty()) {
6109 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6110 traceout.close();
6111 }
6112
6113 _close_sessions();
6114
6115 mounted = false;
6116
6117 ldout(cct, 2) << "unmounted." << dendl;
6118 }
6119
6120 void Client::unmount()
6121 {
6122 std::lock_guard lock(client_lock);
6123 _unmount(false);
6124 }
6125
6126 void Client::abort_conn()
6127 {
6128 std::lock_guard lock(client_lock);
6129 _unmount(true);
6130 }
6131
6132 void Client::flush_cap_releases()
6133 {
6134 // send any cap releases
6135 for (auto &p : mds_sessions) {
6136 auto &session = p.second;
6137 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6138 p.first)) {
6139 if (cct->_conf->client_inject_release_failure) {
6140 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6141 } else {
6142 session.con->send_message2(std::move(session.release));
6143 }
6144 session.release.reset();
6145 }
6146 }
6147 }
6148
6149 void Client::tick()
6150 {
6151 if (cct->_conf->client_debug_inject_tick_delay > 0) {
6152 sleep(cct->_conf->client_debug_inject_tick_delay);
6153 ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
6154 cct->_conf.apply_changes(nullptr);
6155 }
6156
6157 ldout(cct, 21) << "tick" << dendl;
6158 tick_event = timer.add_event_after(
6159 cct->_conf->client_tick_interval,
6160 new FunctionContext([this](int) {
6161 // Called back via Timer, which takes client_lock for us
6162 ceph_assert(client_lock.is_locked_by_me());
6163 tick();
6164 }));
6165 utime_t now = ceph_clock_now();
6166
6167 if (!mounted && !mds_requests.empty()) {
6168 MetaRequest *req = mds_requests.begin()->second;
6169 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6170 req->abort(-ETIMEDOUT);
6171 if (req->caller_cond) {
6172 req->kick = true;
6173 req->caller_cond->Signal();
6174 }
6175 signal_cond_list(waiting_for_mdsmap);
6176 for (auto &p : mds_sessions) {
6177 signal_context_list(p.second.waiting_for_open);
6178 }
6179 }
6180 }
6181
6182 if (mdsmap->get_epoch()) {
6183 // renew caps?
6184 utime_t el = now - last_cap_renew;
6185 if (el > mdsmap->get_session_timeout() / 3.0)
6186 renew_caps();
6187
6188 flush_cap_releases();
6189 }
6190
6191 // delayed caps
6192 xlist<Inode*>::iterator p = delayed_list.begin();
6193 while (!p.end()) {
6194 Inode *in = *p;
6195 ++p;
6196 if (in->hold_caps_until > now)
6197 break;
6198 delayed_list.pop_front();
6199 check_caps(in, CHECK_CAPS_NODELAY);
6200 }
6201
6202 trim_cache(true);
6203 }
6204
6205 void Client::renew_caps()
6206 {
6207 ldout(cct, 10) << "renew_caps()" << dendl;
6208 last_cap_renew = ceph_clock_now();
6209
6210 for (auto &p : mds_sessions) {
6211 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6212 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6213 renew_caps(&p.second);
6214 }
6215 }
6216
6217 void Client::renew_caps(MetaSession *session)
6218 {
6219 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6220 session->last_cap_renew_request = ceph_clock_now();
6221 uint64_t seq = ++session->cap_renew_seq;
6222 session->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6223 }
6224
6225
6226 // ===============================================================
6227 // high level (POSIXy) interface
6228
6229 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6230 InodeRef *target, const UserPerm& perms)
6231 {
6232 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6233 MetaRequest *req = new MetaRequest(op);
6234 filepath path;
6235 dir->make_nosnap_relative_path(path);
6236 path.push_dentry(name);
6237 req->set_filepath(path);
6238 req->set_inode(dir);
6239 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6240 mask |= DEBUG_GETATTR_CAPS;
6241 req->head.args.getattr.mask = mask;
6242
6243 ldout(cct, 10) << __func__ << " on " << path << dendl;
6244
6245 int r = make_request(req, perms, target);
6246 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6247 return r;
6248 }
6249
6250 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6251 const UserPerm& perms)
6252 {
6253 int r = 0;
6254 Dentry *dn = NULL;
6255
6256 if (dname == "..") {
6257 if (dir->dentries.empty()) {
6258 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6259 filepath path(dir->ino);
6260 req->set_filepath(path);
6261
6262 InodeRef tmptarget;
6263 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6264
6265 if (r == 0) {
6266 Inode *tempino = tmptarget.get();
6267 _ll_get(tempino);
6268 *target = tempino;
6269 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6270 } else {
6271 *target = dir;
6272 }
6273 }
6274 else
6275 *target = dir->get_first_parent()->dir->parent_inode; // dirs can't be hard-linked
6276 goto done;
6277 }
6278
6279 if (dname == ".") {
6280 *target = dir;
6281 goto done;
6282 }
6283
6284 if (!dir->is_dir()) {
6285 r = -ENOTDIR;
6286 goto done;
6287 }
6288
6289 if (dname.length() > NAME_MAX) {
6290 r = -ENAMETOOLONG;
6291 goto done;
6292 }
6293
6294 if (dname == cct->_conf->client_snapdir &&
6295 dir->snapid == CEPH_NOSNAP) {
6296 *target = open_snapdir(dir);
6297 goto done;
6298 }
6299
6300 if (dir->dir &&
6301 dir->dir->dentries.count(dname)) {
6302 dn = dir->dir->dentries[dname];
6303
6304 ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
6305 << " seq " << dn->lease_seq
6306 << dendl;
6307
6308 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6309 // is dn lease valid?
6310 utime_t now = ceph_clock_now();
6311 if (dn->lease_mds >= 0 &&
6312 dn->lease_ttl > now &&
6313 mds_sessions.count(dn->lease_mds)) {
6314 MetaSession &s = mds_sessions.at(dn->lease_mds);
6315 if (s.cap_ttl > now &&
6316 s.cap_gen == dn->lease_gen) {
6317 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6318 // make trim_caps() behave.
6319 dir->try_touch_cap(dn->lease_mds);
6320 goto hit_dn;
6321 }
6322 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
6323 << " vs lease_gen " << dn->lease_gen << dendl;
6324 }
6325 // dir lease?
6326 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
6327 if (dn->cap_shared_gen == dir->shared_gen &&
6328 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
6329 goto hit_dn;
6330 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6331 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
6332 << *dir << " dn '" << dname << "'" << dendl;
6333 return -ENOENT;
6334 }
6335 }
6336 } else {
6337 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6338 }
6339 } else {
6340 // can we conclude ENOENT locally?
6341 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
6342 (dir->flags & I_COMPLETE)) {
6343 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6344 return -ENOENT;
6345 }
6346 }
6347
6348 r = _do_lookup(dir, dname, mask, target, perms);
6349 goto done;
6350
6351 hit_dn:
6352 if (dn->inode) {
6353 *target = dn->inode;
6354 } else {
6355 r = -ENOENT;
6356 }
6357 touch_dn(dn);
6358
6359 done:
6360 if (r < 0)
6361 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
6362 else
6363 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
6364 return r;
6365 }
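// Summary of the cache fast path above: a dentry lease satisfies the lookup
// only when all of
//
//   dn->lease_mds >= 0                   // an MDS actually issued a lease
//   dn->lease_ttl > ceph_clock_now()     // the per-dentry TTL is unexpired
//   s.cap_ttl > now && s.cap_gen == dn->lease_gen  // session caps still valid
//
// hold; failing that, a CEPH_CAP_FILE_SHARED cap on the directory with a
// matching shared_gen can still serve the lookup (or a definitive ENOENT,
// when the directory is I_COMPLETE) without a round trip to the MDS.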
6366
6367 int Client::get_or_create(Inode *dir, const char* name,
6368 Dentry **pdn, bool expect_null)
6369 {
6370 // lookup
6371 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6372 dir->open_dir();
6373 if (dir->dir->dentries.count(name)) {
6374 Dentry *dn = dir->dir->dentries[name];
6375
6376 // is dn lease valid?
6377 utime_t now = ceph_clock_now();
6378 if (dn->inode &&
6379 dn->lease_mds >= 0 &&
6380 dn->lease_ttl > now &&
6381 mds_sessions.count(dn->lease_mds)) {
6382 MetaSession &s = mds_sessions.at(dn->lease_mds);
6383 if (s.cap_ttl > now &&
6384 s.cap_gen == dn->lease_gen) {
6385 if (expect_null)
6386 return -EEXIST;
6387 }
6388 }
6389 *pdn = dn;
6390 } else {
6391 // otherwise link up a new one
6392 *pdn = link(dir->dir, name, NULL, NULL);
6393 }
6394
6395 // success
6396 return 0;
6397 }
6398
6399 int Client::path_walk(const filepath& origpath, InodeRef *end,
6400 const UserPerm& perms, bool followsym, int mask)
6401 {
6402 filepath path = origpath;
6403 InodeRef cur;
6404 if (origpath.absolute())
6405 cur = root;
6406 else
6407 cur = cwd;
6408 ceph_assert(cur);
6409
6410 ldout(cct, 10) << __func__ << " " << path << dendl;
6411
6412 int symlinks = 0;
6413
6414 unsigned i=0;
6415 while (i < path.depth() && cur) {
6416 int caps = 0;
6417 const string &dname = path[i];
6418 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6419 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6420 InodeRef next;
6421 if (cct->_conf->client_permissions) {
6422 int r = may_lookup(cur.get(), perms);
6423 if (r < 0)
6424 return r;
6425 caps = CEPH_CAP_AUTH_SHARED;
6426 }
6427
6428 /* Get extra requested caps on the last component */
6429 if (i == (path.depth() - 1))
6430 caps |= mask;
6431 int r = _lookup(cur.get(), dname, caps, &next, perms);
6432 if (r < 0)
6433 return r;
6434 // only follow a trailing symlink if followsym is set; always follow
6435 // intermediate ('directory') symlinks.
6436 if (next && next->is_symlink()) {
6437 symlinks++;
6438 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6439 if (symlinks > MAXSYMLINKS) {
6440 return -ELOOP;
6441 }
6442
6443 if (i < path.depth() - 1) {
6444 // dir symlink
6445 // replace consumed components of path with symlink dir target
6446 filepath resolved(next->symlink.c_str());
6447 resolved.append(path.postfixpath(i + 1));
6448 path = resolved;
6449 i = 0;
6450 if (next->symlink[0] == '/') {
6451 cur = root;
6452 }
6453 continue;
6454 } else if (followsym) {
6455 if (next->symlink[0] == '/') {
6456 path = next->symlink.c_str();
6457 i = 0;
6458 // reset position
6459 cur = root;
6460 } else {
6461 filepath more(next->symlink.c_str());
6462 // we need to remove the symlink component from the path
6463 // before appending the target the symlink points to, and remain
6464 // at the same position in the path.
6465 path.pop_dentry();
6466 path.append(more);
6467 }
6468 continue;
6469 }
6470 }
6471 cur.swap(next);
6472 i++;
6473 }
6474 if (!cur)
6475 return -ENOENT;
6476 if (end)
6477 end->swap(cur);
6478 return 0;
6479 }
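// Symlink traversal example (paths are hypothetical): walking "a/b/link/c"
// where "link" -> "/other" restarts from the root with the remaining
// components, i.e. the walk continues on "other/c". Walking "a/b/link" with
// followsym=false returns the symlink inode itself, which is what
// lstat()-style callers rely on. MAXSYMLINKS bounds the number of
// expansions, so self-referential links fail with -ELOOP.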
6480
6481
6482 // namespace ops
6483
6484 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6485 {
6486 std::lock_guard lock(client_lock);
6487 tout(cct) << "link" << std::endl;
6488 tout(cct) << relexisting << std::endl;
6489 tout(cct) << relpath << std::endl;
6490
6491 if (unmounting)
6492 return -ENOTCONN;
6493
6494 filepath existing(relexisting);
6495
6496 InodeRef in, dir;
6497 int r = path_walk(existing, &in, perm, true);
6498 if (r < 0)
6499 return r;
6500 if (std::string(relpath) == "/") {
6501 r = -EEXIST;
6502 return r;
6503 }
6504 filepath path(relpath);
6505 string name = path.last_dentry();
6506 path.pop_dentry();
6507
6508 r = path_walk(path, &dir, perm, true);
6509 if (r < 0)
6510 return r;
6511 if (cct->_conf->client_permissions) {
6512 if (S_ISDIR(in->mode)) {
6513 r = -EPERM;
6514 return r;
6515 }
6516 r = may_hardlink(in.get(), perm);
6517 if (r < 0)
6518 return r;
6519 r = may_create(dir.get(), perm);
6520 if (r < 0)
6521 return r;
6522 }
6523 r = _link(in.get(), dir.get(), name.c_str(), perm);
6524 return r;
6525 }
6526
6527 int Client::unlink(const char *relpath, const UserPerm& perm)
6528 {
6529 std::lock_guard lock(client_lock);
6530 tout(cct) << __func__ << std::endl;
6531 tout(cct) << relpath << std::endl;
6532
6533 if (unmounting)
6534 return -ENOTCONN;
6535
6536 if (std::string(relpath) == "/")
6537 return -EISDIR;
6538
6539 filepath path(relpath);
6540 string name = path.last_dentry();
6541 path.pop_dentry();
6542 InodeRef dir;
6543 int r = path_walk(path, &dir, perm);
6544 if (r < 0)
6545 return r;
6546 if (cct->_conf->client_permissions) {
6547 r = may_delete(dir.get(), name.c_str(), perm);
6548 if (r < 0)
6549 return r;
6550 }
6551 return _unlink(dir.get(), name.c_str(), perm);
6552 }
6553
6554 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6555 {
6556 std::lock_guard lock(client_lock);
6557 tout(cct) << __func__ << std::endl;
6558 tout(cct) << relfrom << std::endl;
6559 tout(cct) << relto << std::endl;
6560
6561 if (unmounting)
6562 return -ENOTCONN;
6563
6564 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6565 return -EBUSY;
6566
6567 filepath from(relfrom);
6568 filepath to(relto);
6569 string fromname = from.last_dentry();
6570 from.pop_dentry();
6571 string toname = to.last_dentry();
6572 to.pop_dentry();
6573
6574 InodeRef fromdir, todir;
6575 int r = path_walk(from, &fromdir, perm);
6576 if (r < 0)
6577 goto out;
6578 r = path_walk(to, &todir, perm);
6579 if (r < 0)
6580 goto out;
6581
6582 if (cct->_conf->client_permissions) {
6583 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6584 if (r < 0)
6585 return r;
6586 r = may_delete(todir.get(), toname.c_str(), perm);
6587 if (r < 0 && r != -ENOENT)
6588 return r;
6589 }
6590 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6591 out:
6592 return r;
6593 }
6594
6595 // dirs
6596
6597 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6598 {
6599 std::lock_guard lock(client_lock);
6600 tout(cct) << __func__ << std::endl;
6601 tout(cct) << relpath << std::endl;
6602 tout(cct) << mode << std::endl;
6603 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
6604
6605 if (unmounting)
6606 return -ENOTCONN;
6607
6608 if (std::string(relpath) == "/")
6609 return -EEXIST;
6610
6611 filepath path(relpath);
6612 string name = path.last_dentry();
6613 path.pop_dentry();
6614 InodeRef dir;
6615 int r = path_walk(path, &dir, perm);
6616 if (r < 0)
6617 return r;
6618 if (cct->_conf->client_permissions) {
6619 r = may_create(dir.get(), perm);
6620 if (r < 0)
6621 return r;
6622 }
6623 return _mkdir(dir.get(), name.c_str(), mode, perm);
6624 }
6625
6626 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6627 {
6628 std::lock_guard lock(client_lock);
6629 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
6630 tout(cct) << __func__ << std::endl;
6631 tout(cct) << relpath << std::endl;
6632 tout(cct) << mode << std::endl;
6633
6634 if (unmounting)
6635 return -ENOTCONN;
6636
6637 // get through existing parts of path
6638 filepath path(relpath);
6639 unsigned int i;
6640 int r = 0, caps = 0;
6641 InodeRef cur, next;
6642 cur = cwd;
6643 for (i = 0; i < path.depth(); ++i) {
6644 if (cct->_conf->client_permissions) {
6645 r = may_lookup(cur.get(), perms);
6646 if (r < 0)
6647 break;
6648 caps = CEPH_CAP_AUTH_SHARED;
6649 }
6650 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6651 if (r < 0)
6652 break;
6653 cur.swap(next);
6654 }
6655 // check that we have work left to do
6656 if (i == path.depth()) return -EEXIST;
6657 if (r != -ENOENT) return r;
6658 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
6659 // make new directory at each level
6660 for (; i < path.depth(); ++i) {
6661 if (cct->_conf->client_permissions) {
6662 r = may_create(cur.get(), perms);
6663 if (r < 0)
6664 return r;
6665 }
6666 // make new dir
6667 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
6668
6669 // check proper creation/existence
6670 if (r == -EEXIST && i < path.depth() - 1) {
6671 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6672 }
6673 if (r < 0)
6674 return r;
6675 // move to new dir and continue
6676 cur.swap(next);
6677 ldout(cct, 20) << __func__ << ": successfully created directory "
6678 << filepath(cur->ino).get_path() << dendl;
6679 }
6680 return 0;
6681 }
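// mkdirs() behaves like "mkdir -p" (a sketch; the path is hypothetical):
//
//   int r = mkdirs("a/b/c", 0755, perms);  // creates a, a/b, a/b/c as needed
//
// It returns -EEXIST only when every component already exists, and stops at
// the first component the caller may not traverse or create.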
6682
6683 int Client::rmdir(const char *relpath, const UserPerm& perms)
6684 {
6685 std::lock_guard lock(client_lock);
6686 tout(cct) << __func__ << std::endl;
6687 tout(cct) << relpath << std::endl;
6688
6689 if (unmounting)
6690 return -ENOTCONN;
6691
6692 if (std::string(relpath) == "/")
6693 return -EBUSY;
6694
6695 filepath path(relpath);
6696 string name = path.last_dentry();
6697 path.pop_dentry();
6698 InodeRef dir;
6699 int r = path_walk(path, &dir, perms);
6700 if (r < 0)
6701 return r;
6702 if (cct->_conf->client_permissions) {
6703 int r = may_delete(dir.get(), name.c_str(), perms);
6704 if (r < 0)
6705 return r;
6706 }
6707 return _rmdir(dir.get(), name.c_str(), perms);
6708 }
6709
6710 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6711 {
6712 std::lock_guard lock(client_lock);
6713 tout(cct) << __func__ << std::endl;
6714 tout(cct) << relpath << std::endl;
6715 tout(cct) << mode << std::endl;
6716 tout(cct) << rdev << std::endl;
6717
6718 if (unmounting)
6719 return -ENOTCONN;
6720
6721 if (std::string(relpath) == "/")
6722 return -EEXIST;
6723
6724 filepath path(relpath);
6725 string name = path.last_dentry();
6726 path.pop_dentry();
6727 InodeRef dir;
6728 int r = path_walk(path, &dir, perms);
6729 if (r < 0)
6730 return r;
6731 if (cct->_conf->client_permissions) {
6732 int r = may_create(dir.get(), perms);
6733 if (r < 0)
6734 return r;
6735 }
6736 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6737 }
6738
6739 // symlinks
6740
6741 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6742 {
6743 std::lock_guard lock(client_lock);
6744 tout(cct) << __func__ << std::endl;
6745 tout(cct) << target << std::endl;
6746 tout(cct) << relpath << std::endl;
6747
6748 if (unmounting)
6749 return -ENOTCONN;
6750
6751 if (std::string(relpath) == "/")
6752 return -EEXIST;
6753
6754 filepath path(relpath);
6755 string name = path.last_dentry();
6756 path.pop_dentry();
6757 InodeRef dir;
6758 int r = path_walk(path, &dir, perms);
6759 if (r < 0)
6760 return r;
6761 if (cct->_conf->client_permissions) {
6762 int r = may_create(dir.get(), perms);
6763 if (r < 0)
6764 return r;
6765 }
6766 return _symlink(dir.get(), name.c_str(), target, perms);
6767 }
6768
6769 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6770 {
6771 std::lock_guard lock(client_lock);
6772 tout(cct) << __func__ << std::endl;
6773 tout(cct) << relpath << std::endl;
6774
6775 if (unmounting)
6776 return -ENOTCONN;
6777
6778 filepath path(relpath);
6779 InodeRef in;
6780 int r = path_walk(path, &in, perms, false);
6781 if (r < 0)
6782 return r;
6783
6784 return _readlink(in.get(), buf, size);
6785 }
6786
6787 int Client::_readlink(Inode *in, char *buf, size_t size)
6788 {
6789 if (!in->is_symlink())
6790 return -EINVAL;
6791
6792 // copy into buf (at most size bytes)
6793 int r = in->symlink.length();
6794 if (r > (int)size)
6795 r = size;
6796 memcpy(buf, in->symlink.c_str(), r);
6797 return r;
6798 }
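// As with readlink(2), the result is not NUL-terminated and is silently
// truncated to `size` bytes. A caller wanting a C string should reserve a
// byte (a sketch):
//
//   char buf[PATH_MAX];
//   int n = readlink("/mylink", buf, sizeof(buf) - 1, perms);
//   if (n >= 0)
//     buf[n] = '\0';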
6799
6800
6801 // inode stuff
6802
6803 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6804 {
6805 bool yes = in->caps_issued_mask(mask, true);
6806
6807 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
6808 if (yes && !force)
6809 return 0;
6810
6811 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6812 filepath path;
6813 in->make_nosnap_relative_path(path);
6814 req->set_filepath(path);
6815 req->set_inode(in);
6816 req->head.args.getattr.mask = mask;
6817
6818 int res = make_request(req, perms);
6819 ldout(cct, 10) << __func__ << " result=" << res << dendl;
6820 return res;
6821 }
6822
6823 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6824 const UserPerm& perms, InodeRef *inp)
6825 {
6826 int issued = in->caps_issued();
6827
6828 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
6829 ccap_string(issued) << dendl;
6830
6831 if (in->snapid != CEPH_NOSNAP) {
6832 return -EROFS;
6833 }
6834 if ((mask & CEPH_SETATTR_SIZE) &&
6835 (unsigned long)stx->stx_size > in->size &&
6836 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6837 perms)) {
6838 return -EDQUOT;
6839 }
6840
6841 // make the change locally?
6842 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6843 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6844 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6845 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6846 << in->cap_dirtier_gid << ", forcing sync setattr"
6847 << dendl;
6848 /*
6849 * This works because we implicitly flush the caps as part of the
6850 * request, so the cap update check will happen with the writeback
6851 * cap context, and then the setattr check will happen with the
6852 * caller's context.
6853 *
6854 * In reality this pattern is likely pretty rare (different users
6855 * setattr'ing the same file). If that turns out not to be the
6856 * case later, we can build a more complex pipelined cap writeback
6857 * infrastructure...
6858 */
6859 if (!mask)
6860 mask |= CEPH_SETATTR_CTIME;
6861 goto force_request;
6862 }
6863
6864 if (!mask) {
6865 // caller just needs us to bump the ctime
6866 in->ctime = ceph_clock_now();
6867 in->cap_dirtier_uid = perms.uid();
6868 in->cap_dirtier_gid = perms.gid();
6869 if (issued & CEPH_CAP_AUTH_EXCL)
6870 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6871 else if (issued & CEPH_CAP_FILE_EXCL)
6872 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
6873 else if (issued & CEPH_CAP_XATTR_EXCL)
6874 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
6875 else
6876 mask |= CEPH_SETATTR_CTIME;
6877 }
6878
6879 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6880 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6881
6882 mask &= ~CEPH_SETATTR_KILL_SGUID;
6883
6884 if (mask & CEPH_SETATTR_UID) {
6885 in->ctime = ceph_clock_now();
6886 in->cap_dirtier_uid = perms.uid();
6887 in->cap_dirtier_gid = perms.gid();
6888 in->uid = stx->stx_uid;
6889 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6890 mask &= ~CEPH_SETATTR_UID;
6891 kill_sguid = true;
6892 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6893 }
6894 if (mask & CEPH_SETATTR_GID) {
6895 in->ctime = ceph_clock_now();
6896 in->cap_dirtier_uid = perms.uid();
6897 in->cap_dirtier_gid = perms.gid();
6898 in->gid = stx->stx_gid;
6899 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6900 mask &= ~CEPH_SETATTR_GID;
6901 kill_sguid = true;
6902 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6903 }
6904
6905 if (mask & CEPH_SETATTR_MODE) {
6906 in->ctime = ceph_clock_now();
6907 in->cap_dirtier_uid = perms.uid();
6908 in->cap_dirtier_gid = perms.gid();
6909 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
6910 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6911 mask &= ~CEPH_SETATTR_MODE;
6912 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6913 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
6914 /* Must squash any setuid/setgid bits with an ownership change */
6915 in->mode &= ~(S_ISUID|S_ISGID);
6916 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6917 }
6918
6919 if (mask & CEPH_SETATTR_BTIME) {
6920 in->ctime = ceph_clock_now();
6921 in->cap_dirtier_uid = perms.uid();
6922 in->cap_dirtier_gid = perms.gid();
6923 in->btime = utime_t(stx->stx_btime);
6924 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6925 mask &= ~CEPH_SETATTR_BTIME;
6926 ldout(cct,10) << "changing btime to " << in->btime << dendl;
6927 }
6928 } else if (mask & CEPH_SETATTR_SIZE) {
6929 /* If we don't have Ax, then we must ask the server to clear the setuid/setgid bits on truncate */
6930 mask |= CEPH_SETATTR_KILL_SGUID;
6931 }
6932
6933 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
6934 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
6935 if (mask & CEPH_SETATTR_MTIME)
6936 in->mtime = utime_t(stx->stx_mtime);
6937 if (mask & CEPH_SETATTR_ATIME)
6938 in->atime = utime_t(stx->stx_atime);
6939 in->ctime = ceph_clock_now();
6940 in->cap_dirtier_uid = perms.uid();
6941 in->cap_dirtier_gid = perms.gid();
6942 in->time_warp_seq++;
6943 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
6944 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
6945 }
6946 }
6947 if (!mask) {
6948 in->change_attr++;
6949 return 0;
6950 }
6951
6952 force_request:
6953 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
6954
6955 filepath path;
6956
6957 in->make_nosnap_relative_path(path);
6958 req->set_filepath(path);
6959 req->set_inode(in);
6960
6961 if (mask & CEPH_SETATTR_KILL_SGUID) {
6962 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6963 }
6964 if (mask & CEPH_SETATTR_MODE) {
6965 req->head.args.setattr.mode = stx->stx_mode;
6966 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6967 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6968 }
6969 if (mask & CEPH_SETATTR_UID) {
6970 req->head.args.setattr.uid = stx->stx_uid;
6971 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6972 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6973 }
6974 if (mask & CEPH_SETATTR_GID) {
6975 req->head.args.setattr.gid = stx->stx_gid;
6976 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6977 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6978 }
6979 if (mask & CEPH_SETATTR_BTIME) {
6980 req->head.args.setattr.btime = utime_t(stx->stx_btime);
6981 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6982 }
6983 if (mask & CEPH_SETATTR_MTIME) {
6984 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
6985 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
6986 CEPH_CAP_FILE_WR;
6987 }
6988 if (mask & CEPH_SETATTR_ATIME) {
6989 req->head.args.setattr.atime = utime_t(stx->stx_atime);
6990 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
6991 CEPH_CAP_FILE_WR;
6992 }
6993 if (mask & CEPH_SETATTR_SIZE) {
6994 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
6995 req->head.args.setattr.size = stx->stx_size;
6996 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
6997 } else { //too big!
6998 put_request(req);
6999 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7000 return -EFBIG;
7001 }
7002 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7003 CEPH_CAP_FILE_WR;
7004 }
7005 req->head.args.setattr.mask = mask;
7006
7007 req->regetattr_mask = mask;
7008
7009 int res = make_request(req, perms, inp);
7010 ldout(cct, 10) << "_setattr result=" << res << dendl;
7011 return res;
7012 }
7013
7014 /* Note that we only care about attrs that setattr cares about */
7015 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7016 {
7017 stx->stx_size = st->st_size;
7018 stx->stx_mode = st->st_mode;
7019 stx->stx_uid = st->st_uid;
7020 stx->stx_gid = st->st_gid;
7021 #ifdef __APPLE__
7022 stx->stx_mtime = st->st_mtimespec;
7023 stx->stx_atime = st->st_atimespec;
7024 #else
7025 stx->stx_mtime = st->st_mtim;
7026 stx->stx_atime = st->st_atim;
7027 #endif
7028 }
7029
7030 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7031 const UserPerm& perms, InodeRef *inp)
7032 {
7033 int ret = _do_setattr(in, stx, mask, perms, inp);
7034 if (ret < 0)
7035 return ret;
7036 if (mask & CEPH_SETATTR_MODE)
7037 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7038 return ret;
7039 }
7040
7041 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7042 const UserPerm& perms)
7043 {
7044 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7045 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7046 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7047 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7048 if (cct->_conf->client_permissions) {
7049 int r = may_setattr(in.get(), stx, mask, perms);
7050 if (r < 0)
7051 return r;
7052 }
7053 return __setattrx(in.get(), stx, mask, perms);
7054 }
7055
7056 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7057 const UserPerm& perms)
7058 {
7059 struct ceph_statx stx;
7060
7061 stat_to_statx(attr, &stx);
7062 mask &= ~CEPH_SETATTR_BTIME;
7063
7064 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7065 mask &= ~CEPH_SETATTR_UID;
7066 }
7067 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
7068 mask &= ~CEPH_SETATTR_GID;
7069 }
7070
7071 return _setattrx(in, &stx, mask, perms);
7072 }
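// Note: the -1 checks above mirror chown(2), where an id of (uid_t)-1 or
// (gid_t)-1 means "leave unchanged". For example,
// chown(path, 1000, (gid_t)-1, perms) drops CEPH_SETATTR_GID from the mask,
// so only the owner is updated.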
7073
7074 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7075 const UserPerm& perms)
7076 {
7077 std::lock_guard lock(client_lock);
7078 tout(cct) << __func__ << std::endl;
7079 tout(cct) << relpath << std::endl;
7080 tout(cct) << mask << std::endl;
7081
7082 if (unmounting)
7083 return -ENOTCONN;
7084
7085 filepath path(relpath);
7086 InodeRef in;
7087 int r = path_walk(path, &in, perms);
7088 if (r < 0)
7089 return r;
7090 return _setattr(in, attr, mask, perms);
7091 }
7092
7093 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7094 const UserPerm& perms, int flags)
7095 {
7096 std::lock_guard lock(client_lock);
7097 tout(cct) << __func__ << std::endl;
7098 tout(cct) << relpath << std::endl;
7099 tout(cct) << mask << std::endl;
7100
7101 if (unmounting)
7102 return -ENOTCONN;
7103
7104 filepath path(relpath);
7105 InodeRef in;
7106 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7107 if (r < 0)
7108 return r;
7109 return _setattrx(in, stx, mask, perms);
7110 }
7111
7112 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7113 {
7114 std::lock_guard lock(client_lock);
7115 tout(cct) << __func__ << std::endl;
7116 tout(cct) << fd << std::endl;
7117 tout(cct) << mask << std::endl;
7118
7119 if (unmounting)
7120 return -ENOTCONN;
7121
7122 Fh *f = get_filehandle(fd);
7123 if (!f)
7124 return -EBADF;
7125 #if defined(__linux__) && defined(O_PATH)
7126 if (f->flags & O_PATH)
7127 return -EBADF;
7128 #endif
7129 return _setattr(f->inode, attr, mask, perms);
7130 }
7131
7132 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7133 {
7134 std::lock_guard lock(client_lock);
7135 tout(cct) << __func__ << std::endl;
7136 tout(cct) << fd << std::endl;
7137 tout(cct) << mask << std::endl;
7138
7139 if (unmounting)
7140 return -ENOTCONN;
7141
7142 Fh *f = get_filehandle(fd);
7143 if (!f)
7144 return -EBADF;
7145 #if defined(__linux__) && defined(O_PATH)
7146 if (f->flags & O_PATH)
7147 return -EBADF;
7148 #endif
7149 return _setattrx(f->inode, stx, mask, perms);
7150 }
7151
7152 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7153 frag_info_t *dirstat, int mask)
7154 {
7155 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7156 std::lock_guard lock(client_lock);
7157 tout(cct) << "stat" << std::endl;
7158 tout(cct) << relpath << std::endl;
7159
7160 if (unmounting)
7161 return -ENOTCONN;
7162
7163 filepath path(relpath);
7164 InodeRef in;
7165 int r = path_walk(path, &in, perms, true, mask);
7166 if (r < 0)
7167 return r;
7168 r = _getattr(in, mask, perms);
7169 if (r < 0) {
7170 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7171 return r;
7172 }
7173 fill_stat(in, stbuf, dirstat);
7174 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7175 return r;
7176 }
7177
7178 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7179 {
7180 unsigned mask = 0;
7181
7182 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7183 if (flags & AT_NO_ATTR_SYNC)
7184 goto out;
7185
7186 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7187 mask |= CEPH_CAP_PIN;
7188 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7189 mask |= CEPH_CAP_AUTH_SHARED;
7190 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7191 mask |= CEPH_CAP_LINK_SHARED;
7192 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7193 mask |= CEPH_CAP_FILE_SHARED;
7194 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7195 mask |= CEPH_CAP_XATTR_SHARED;
7196 out:
7197 return mask;
7198 }
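// Example mapping: a caller wanting only CEPH_STATX_SIZE gets back
// CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED; wanting CEPH_STATX_VERSION pulls in
// all four shared caps, since change_attr can be bumped by any of them.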
7199
7200 int Client::statx(const char *relpath, struct ceph_statx *stx,
7201 const UserPerm& perms,
7202 unsigned int want, unsigned int flags)
7203 {
7204 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7205 std::lock_guard lock(client_lock);
7206 tout(cct) << "statx" << std::endl;
7207 tout(cct) << relpath << std::endl;
7208
7209 if (unmounting)
7210 return -ENOTCONN;
7211
7212 filepath path(relpath);
7213 InodeRef in;
7214
7215 unsigned mask = statx_to_mask(flags, want);
7216
7217 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7218 if (r < 0)
7219 return r;
7220
7221 r = _getattr(in, mask, perms);
7222 if (r < 0) {
7223 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7224 return r;
7225 }
7226
7227 fill_statx(in, mask, stx);
7228 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7229 return r;
7230 }
7231
7232 int Client::lstat(const char *relpath, struct stat *stbuf,
7233 const UserPerm& perms, frag_info_t *dirstat, int mask)
7234 {
7235 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7236 std::lock_guard lock(client_lock);
7237 tout(cct) << __func__ << std::endl;
7238 tout(cct) << relpath << std::endl;
7239
7240 if (unmounting)
7241 return -ENOTCONN;
7242
7243 filepath path(relpath);
7244 InodeRef in;
7245 // don't follow symlinks
7246 int r = path_walk(path, &in, perms, false, mask);
7247 if (r < 0)
7248 return r;
7249 r = _getattr(in, mask, perms);
7250 if (r < 0) {
7251 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7252 return r;
7253 }
7254 fill_stat(in, stbuf, dirstat);
7255 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7256 return r;
7257 }
7258
7259 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7260 {
7261 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
7262 << " mode 0" << oct << in->mode << dec
7263 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7264 memset(st, 0, sizeof(struct stat));
7265 if (use_faked_inos())
7266 st->st_ino = in->faked_ino;
7267 else
7268 st->st_ino = in->ino;
7269 st->st_dev = in->snapid;
7270 st->st_mode = in->mode;
7271 st->st_rdev = in->rdev;
7272 if (in->is_dir()) {
7273 switch (in->nlink) {
7274 case 0:
7275 st->st_nlink = 0; /* dir is unlinked */
7276 break;
7277 case 1:
7278 st->st_nlink = 1 /* parent dentry */
7279 + 1 /* <dir>/. */
7280 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
7281 break;
7282 default:
7283 ceph_abort();
7284 }
7285 } else {
7286 st->st_nlink = in->nlink;
7287 }
7288 st->st_uid = in->uid;
7289 st->st_gid = in->gid;
7290 if (in->ctime > in->mtime) {
7291 stat_set_ctime_sec(st, in->ctime.sec());
7292 stat_set_ctime_nsec(st, in->ctime.nsec());
7293 } else {
7294 stat_set_ctime_sec(st, in->mtime.sec());
7295 stat_set_ctime_nsec(st, in->mtime.nsec());
7296 }
7297 stat_set_atime_sec(st, in->atime.sec());
7298 stat_set_atime_nsec(st, in->atime.nsec());
7299 stat_set_mtime_sec(st, in->mtime.sec());
7300 stat_set_mtime_nsec(st, in->mtime.nsec());
7301 if (in->is_dir()) {
7302 if (cct->_conf->client_dirsize_rbytes)
7303 st->st_size = in->rstat.rbytes;
7304 else
7305 st->st_size = in->dirstat.size();
7306 st->st_blocks = 1;
7307 } else {
7308 st->st_size = in->size;
7309 st->st_blocks = (in->size + 511) >> 9;
7310 }
7311 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7312
7313 if (dirstat)
7314 *dirstat = in->dirstat;
7315 if (rstat)
7316 *rstat = in->rstat;
7317
7318 return in->caps_issued();
7319 }
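// Directory link-count example: a live directory holding two subdirectories
// reports st_nlink = 1 (its parent dentry) + 1 (its own "." entry) + 2 (one
// ".." back-reference per subdirectory) = 4, the traditional POSIX
// convention.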
7320
7321 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7322 {
7323 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
7324 << " mode 0" << oct << in->mode << dec
7325 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7326 memset(stx, 0, sizeof(struct ceph_statx));
7327
7328 /*
7329 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7330 * so that all bits are set.
7331 */
7332 if (!mask)
7333 mask = ~0;
7334
7335 /* These are always considered to be available */
7336 stx->stx_dev = in->snapid;
7337 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7338
7339 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7340 stx->stx_mode = S_IFMT & in->mode;
7341 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7342 stx->stx_rdev = in->rdev;
7343 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7344
7345 if (mask & CEPH_CAP_AUTH_SHARED) {
7346 stx->stx_uid = in->uid;
7347 stx->stx_gid = in->gid;
7348 stx->stx_mode = in->mode;
7349 in->btime.to_timespec(&stx->stx_btime);
7350 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7351 }
7352
7353 if (mask & CEPH_CAP_LINK_SHARED) {
7354 if (in->is_dir()) {
7355 switch (in->nlink) {
7356 case 0:
7357 stx->stx_nlink = 0; /* dir is unlinked */
7358 break;
7359 case 1:
7360 stx->stx_nlink = 1 /* parent dentry */
7361 + 1 /* <dir>/. */
7362 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
7363 break;
7364 default:
7365 ceph_abort();
7366 }
7367 } else {
7368 stx->stx_nlink = in->nlink;
7369 }
7370 stx->stx_mask |= CEPH_STATX_NLINK;
7371 }
7372
7373 if (mask & CEPH_CAP_FILE_SHARED) {
7374
7375 in->atime.to_timespec(&stx->stx_atime);
7376 in->mtime.to_timespec(&stx->stx_mtime);
7377
7378 if (in->is_dir()) {
7379 if (cct->_conf->client_dirsize_rbytes)
7380 stx->stx_size = in->rstat.rbytes;
7381 else
7382 stx->stx_size = in->dirstat.size();
7383 stx->stx_blocks = 1;
7384 } else {
7385 stx->stx_size = in->size;
7386 stx->stx_blocks = (in->size + 511) >> 9;
7387 }
7388 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7389 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7390 }
7391
7392 /* Change time and change_attr both require all shared caps to view */
7393 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7394 stx->stx_version = in->change_attr;
7395 if (in->ctime > in->mtime)
7396 in->ctime.to_timespec(&stx->stx_ctime);
7397 else
7398 in->mtime.to_timespec(&stx->stx_ctime);
7399 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7400 }
7401
7402 }
7403
7404 void Client::touch_dn(Dentry *dn)
7405 {
7406 lru.lru_touch(dn);
7407 }
7408
7409 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7410 {
7411 std::lock_guard lock(client_lock);
7412 tout(cct) << __func__ << std::endl;
7413 tout(cct) << relpath << std::endl;
7414 tout(cct) << mode << std::endl;
7415
7416 if (unmounting)
7417 return -ENOTCONN;
7418
7419 filepath path(relpath);
7420 InodeRef in;
7421 int r = path_walk(path, &in, perms);
7422 if (r < 0)
7423 return r;
7424 struct stat attr;
7425 attr.st_mode = mode;
7426 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7427 }
7428
7429 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7430 {
7431 std::lock_guard lock(client_lock);
7432 tout(cct) << __func__ << std::endl;
7433 tout(cct) << fd << std::endl;
7434 tout(cct) << mode << std::endl;
7435
7436 if (unmounting)
7437 return -ENOTCONN;
7438
7439 Fh *f = get_filehandle(fd);
7440 if (!f)
7441 return -EBADF;
7442 #if defined(__linux__) && defined(O_PATH)
7443 if (f->flags & O_PATH)
7444 return -EBADF;
7445 #endif
7446 struct stat attr;
7447 attr.st_mode = mode;
7448 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7449 }
7450
7451 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7452 {
7453 std::lock_guard lock(client_lock);
7454 tout(cct) << __func__ << std::endl;
7455 tout(cct) << relpath << std::endl;
7456 tout(cct) << mode << std::endl;
7457
7458 if (unmounting)
7459 return -ENOTCONN;
7460
7461 filepath path(relpath);
7462 InodeRef in;
7463 // don't follow symlinks
7464 int r = path_walk(path, &in, perms, false);
7465 if (r < 0)
7466 return r;
7467 struct stat attr;
7468 attr.st_mode = mode;
7469 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7470 }
7471
7472 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7473 const UserPerm& perms)
7474 {
7475 std::lock_guard lock(client_lock);
7476 tout(cct) << __func__ << std::endl;
7477 tout(cct) << relpath << std::endl;
7478 tout(cct) << new_uid << std::endl;
7479 tout(cct) << new_gid << std::endl;
7480
7481 if (unmounting)
7482 return -ENOTCONN;
7483
7484 filepath path(relpath);
7485 InodeRef in;
7486 int r = path_walk(path, &in, perms);
7487 if (r < 0)
7488 return r;
7489 struct stat attr;
7490 attr.st_uid = new_uid;
7491 attr.st_gid = new_gid;
7492 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7493 }
7494
7495 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7496 {
7497 std::lock_guard lock(client_lock);
7498 tout(cct) << __func__ << std::endl;
7499 tout(cct) << fd << std::endl;
7500 tout(cct) << new_uid << std::endl;
7501 tout(cct) << new_gid << std::endl;
7502
7503 if (unmounting)
7504 return -ENOTCONN;
7505
7506 Fh *f = get_filehandle(fd);
7507 if (!f)
7508 return -EBADF;
7509 #if defined(__linux__) && defined(O_PATH)
7510 if (f->flags & O_PATH)
7511 return -EBADF;
7512 #endif
7513 struct stat attr;
7514 attr.st_uid = new_uid;
7515 attr.st_gid = new_gid;
7516 int mask = 0;
7517 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7518 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7519 return _setattr(f->inode, &attr, mask, perms);
7520 }
7521
7522 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7523 const UserPerm& perms)
7524 {
7525 std::lock_guard lock(client_lock);
7526 tout(cct) << __func__ << std::endl;
7527 tout(cct) << relpath << std::endl;
7528 tout(cct) << new_uid << std::endl;
7529 tout(cct) << new_gid << std::endl;
7530
7531 if (unmounting)
7532 return -ENOTCONN;
7533
7534 filepath path(relpath);
7535 InodeRef in;
7536 // don't follow symlinks
7537 int r = path_walk(path, &in, perms, false);
7538 if (r < 0)
7539 return r;
7540 struct stat attr;
7541 attr.st_uid = new_uid;
7542 attr.st_gid = new_gid;
7543 int mask = 0;
7544 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7545 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7546 return _setattr(in, &attr, mask, perms);
7547 }
7548
7549 static void attr_set_atime_and_mtime(struct stat *attr,
7550 const utime_t &atime,
7551 const utime_t &mtime)
7552 {
7553 stat_set_atime_sec(attr, atime.tv.tv_sec);
7554 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
7555 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
7556 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
7557 }
7558
7559 // for [l]utime(), invoke the timeval variant, as the timespec
7560 // variants are not yet implemented. for futime[s](), invoke
7561 // the timespec variant.
7562 int Client::utime(const char *relpath, struct utimbuf *buf,
7563 const UserPerm& perms)
7564 {
7565 struct timeval tv[2];
7566 tv[0].tv_sec = buf->actime;
7567 tv[0].tv_usec = 0;
7568 tv[1].tv_sec = buf->modtime;
7569 tv[1].tv_usec = 0;
7570
7571 return utimes(relpath, tv, perms);
7572 }
7573
7574 int Client::lutime(const char *relpath, struct utimbuf *buf,
7575 const UserPerm& perms)
7576 {
7577 struct timeval tv[2];
7578 tv[0].tv_sec = buf->actime;
7579 tv[0].tv_usec = 0;
7580 tv[1].tv_sec = buf->modtime;
7581 tv[1].tv_usec = 0;
7582
7583 return lutimes(relpath, tv, perms);
7584 }
7585
7586 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7587 {
7588 struct timespec ts[2];
7589 ts[0].tv_sec = buf->actime;
7590 ts[0].tv_nsec = 0;
7591 ts[1].tv_sec = buf->modtime;
7592 ts[1].tv_nsec = 0;
7593
7594 return futimens(fd, ts, perms);
7595 }
7596
7597 int Client::utimes(const char *relpath, struct timeval times[2],
7598 const UserPerm& perms)
7599 {
7600 std::lock_guard lock(client_lock);
7601 tout(cct) << __func__ << std::endl;
7602 tout(cct) << relpath << std::endl;
7603 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7604 << std::endl;
7605 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7606 << std::endl;
7607
7608 if (unmounting)
7609 return -ENOTCONN;
7610
7611 filepath path(relpath);
7612 InodeRef in;
7613 int r = path_walk(path, &in, perms);
7614 if (r < 0)
7615 return r;
7616 struct stat attr;
7617 utime_t atime(times[0]);
7618 utime_t mtime(times[1]);
7619
7620 attr_set_atime_and_mtime(&attr, atime, mtime);
7621 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7622 }
7623
7624 int Client::lutimes(const char *relpath, struct timeval times[2],
7625 const UserPerm& perms)
7626 {
7627 std::lock_guard lock(client_lock);
7628 tout(cct) << __func__ << std::endl;
7629 tout(cct) << relpath << std::endl;
7630 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7631 << std::endl;
7632 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7633 << std::endl;
7634
7635 if (unmounting)
7636 return -ENOTCONN;
7637
7638 filepath path(relpath);
7639 InodeRef in;
7640 int r = path_walk(path, &in, perms, false);
7641 if (r < 0)
7642 return r;
7643 struct stat attr;
7644 utime_t atime(times[0]);
7645 utime_t mtime(times[1]);
7646
7647 attr_set_atime_and_mtime(&attr, atime, mtime);
7648 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7649 }
7650
7651 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7652 {
7653 struct timespec ts[2];
7654 ts[0].tv_sec = times[0].tv_sec;
7655 ts[0].tv_nsec = times[0].tv_usec * 1000;
7656 ts[1].tv_sec = times[1].tv_sec;
7657 ts[1].tv_nsec = times[1].tv_usec * 1000;
7658
7659 return futimens(fd, ts, perms);
7660 }
7661
7662 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7663 {
7664 std::lock_guard lock(client_lock);
7665 tout(cct) << __func__ << std::endl;
7666 tout(cct) << fd << std::endl;
7667 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7668 << std::endl;
7669 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7670 << std::endl;
7671
7672 if (unmounting)
7673 return -ENOTCONN;
7674
7675 Fh *f = get_filehandle(fd);
7676 if (!f)
7677 return -EBADF;
7678 #if defined(__linux__) && defined(O_PATH)
7679 if (f->flags & O_PATH)
7680 return -EBADF;
7681 #endif
7682 struct stat attr;
7683 utime_t atime(times[0]);
7684 utime_t mtime(times[1]);
7685
7686 attr_set_atime_and_mtime(&attr, atime, mtime);
7687 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7688 }
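// A note on the utime family above: every entry point funnels into
// _setattr() with CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME; only the time
// conversion differs. utime()/lutime() widen a utimbuf to whole-second
// timevals, futime() widens it to whole-second timespecs, e.g.:
//
//   struct timeval tv[2];
//   tv[0] = { buf->actime, 0 };  // atime, zero microseconds
//   tv[1] = { buf->modtime, 0 }; // mtime, zero microseconds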
7689
7690 int Client::flock(int fd, int operation, uint64_t owner)
7691 {
7692 std::lock_guard lock(client_lock);
7693 tout(cct) << __func__ << std::endl;
7694 tout(cct) << fd << std::endl;
7695 tout(cct) << operation << std::endl;
7696 tout(cct) << owner << std::endl;
7697
7698 if (unmounting)
7699 return -ENOTCONN;
7700
7701 Fh *f = get_filehandle(fd);
7702 if (!f)
7703 return -EBADF;
7704
7705 return _flock(f, operation, owner);
7706 }
7707
7708 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7709 {
7710 std::lock_guard lock(client_lock);
7711 tout(cct) << __func__ << std::endl;
7712 tout(cct) << relpath << std::endl;
7713
7714 if (unmounting)
7715 return -ENOTCONN;
7716
7717 filepath path(relpath);
7718 InodeRef in;
7719 int r = path_walk(path, &in, perms, true);
7720 if (r < 0)
7721 return r;
7722 if (cct->_conf->client_permissions) {
7723 int r = may_open(in.get(), O_RDONLY, perms);
7724 if (r < 0)
7725 return r;
7726 }
7727 r = _opendir(in.get(), dirpp, perms);
7728 /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
7729 if (r != -ENOTDIR)
7730 tout(cct) << (unsigned long)*dirpp << std::endl;
7731 return r;
7732 }
7733
7734 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7735 {
7736 if (!in->is_dir())
7737 return -ENOTDIR;
7738 *dirpp = new dir_result_t(in, perms);
7739 opened_dirs.insert(*dirpp);
7740 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7741 return 0;
7742 }
7743
7744
7745 int Client::closedir(dir_result_t *dir)
7746 {
7747 std::lock_guard lock(client_lock);
7748 tout(cct) << __func__ << std::endl;
7749 tout(cct) << (unsigned long)dir << std::endl;
7750
7751 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
7752 _closedir(dir);
7753 return 0;
7754 }
7755
7756 void Client::_closedir(dir_result_t *dirp)
7757 {
7758 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
7759 if (dirp->inode) {
7760 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
7761 dirp->inode.reset();
7762 }
7763 _readdir_drop_dirp_buffer(dirp);
7764 opened_dirs.erase(dirp);
7765 delete dirp;
7766 }
7767
7768 void Client::rewinddir(dir_result_t *dirp)
7769 {
7770 std::lock_guard lock(client_lock);
7771 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
7772
7773 if (unmounting)
7774 return;
7775
7776 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7777 _readdir_drop_dirp_buffer(d);
7778 d->reset();
7779 }
7780
7781 loff_t Client::telldir(dir_result_t *dirp)
7782 {
7783 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7784 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
7785 return d->offset;
7786 }
7787
7788 void Client::seekdir(dir_result_t *dirp, loff_t offset)
7789 {
7790 std::lock_guard lock(client_lock);
7791
7792 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
7793
7794 if (unmounting)
7795 return;
7796
7797 if (offset == dirp->offset)
7798 return;
7799
7800 if (offset > dirp->offset)
7801 dirp->release_count = 0; // forward seek: this walk can no longer prove completeness
7802 else
7803 dirp->ordered_count = 0; // disable filling readdir cache
7804
7805 if (dirp->hash_order()) {
7806 if (dirp->offset > offset) {
7807 _readdir_drop_dirp_buffer(dirp);
7808 dirp->reset();
7809 }
7810 } else {
7811 if (offset == 0 ||
7812 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
7813 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
7814 _readdir_drop_dirp_buffer(dirp);
7815 dirp->reset();
7816 }
7817 }
7818
7819 dirp->offset = offset;
7820 }
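// A sketch of the offset ("fpos") encoding assumed by seekdir() above:
// the fragment id lives in the high bits and the position within that
// fragment in the low bits, so the buffer only has to be dropped when the
// target lands in a different fragment or earlier in the same one:
//
//   int64_t fpos = dir_result_t::make_fpos(fg, 2, false);  // frag fg, entry 2
//   frag_t high = frag_t(dir_result_t::fpos_high(fpos));   // recovers fg
//   auto low = dir_result_t::fpos_low(fpos);                // recovers 2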
7821
7822
7823 //struct dirent {
7824 // ino_t d_ino; /* inode number */
7825 // off_t d_off; /* offset to the next dirent */
7826 // unsigned short d_reclen; /* length of this record */
7827 // unsigned char d_type; /* type of file */
7828 // char d_name[256]; /* filename */
7829 //};
7830 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
7831 {
7832 strncpy(de->d_name, name, 255);
7833 de->d_name[255] = '\0';
7834 #ifndef __CYGWIN__
7835 de->d_ino = ino;
7836 #if !defined(__APPLE__) && !defined(__FreeBSD__)
7837 de->d_off = next_off;
7838 #endif
7839 de->d_reclen = 1;
7840 de->d_type = IFTODT(type);
7841 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
7842 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
7843 #endif
7844 }
7845
7846 void Client::_readdir_next_frag(dir_result_t *dirp)
7847 {
7848 frag_t fg = dirp->buffer_frag;
7849
7850 if (fg.is_rightmost()) {
7851 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
7852 dirp->set_end();
7853 return;
7854 }
7855
7856 // advance
7857 fg = fg.next();
7858 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
7859
7860 if (dirp->hash_order()) {
7861 // keep last_name
7862 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
7863 if (dirp->offset < new_offset) // don't decrease offset
7864 dirp->offset = new_offset;
7865 } else {
7866 dirp->last_name.clear();
7867 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7868 _readdir_rechoose_frag(dirp);
7869 }
7870 }
7871
7872 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
7873 {
7874 ceph_assert(dirp->inode);
7875
7876 if (dirp->hash_order())
7877 return;
7878
7879 frag_t cur = frag_t(dirp->offset_high());
7880 frag_t fg = dirp->inode->dirfragtree[cur.value()];
7881 if (fg != cur) {
7882 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
7883 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7884 dirp->last_name.clear();
7885 dirp->next_offset = 2;
7886 }
7887 }
7888
7889 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7890 {
7891 ldout(cct, 10) << __func__ << " " << dirp << dendl;
7892 dirp->buffer.clear();
7893 }
7894
7895 int Client::_readdir_get_frag(dir_result_t *dirp)
7896 {
7897 ceph_assert(dirp);
7898 ceph_assert(dirp->inode);
7899
7900 // get the current frag.
7901 frag_t fg;
7902 if (dirp->hash_order())
7903 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7904 else
7905 fg = frag_t(dirp->offset_high());
7906
7907 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
7908 << " offset " << hex << dirp->offset << dec << dendl;
7909
7910 int op = CEPH_MDS_OP_READDIR;
7911 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7912 op = CEPH_MDS_OP_LSSNAP;
7913
7914 InodeRef& diri = dirp->inode;
7915
7916 MetaRequest *req = new MetaRequest(op);
7917 filepath path;
7918 diri->make_nosnap_relative_path(path);
7919 req->set_filepath(path);
7920 req->set_inode(diri.get());
7921 req->head.args.readdir.frag = fg;
7922 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7923 if (dirp->last_name.length()) {
7924 req->path2.set_path(dirp->last_name);
7925 } else if (dirp->hash_order()) {
7926 req->head.args.readdir.offset_hash = dirp->offset_high();
7927 }
7928 req->dirp = dirp;
7929
7930 bufferlist dirbl;
7931 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7932
7933 if (res == -EAGAIN) {
7934 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
7935 _readdir_rechoose_frag(dirp);
7936 return _readdir_get_frag(dirp);
7937 }
7938
7939 if (res == 0) {
7940 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
7941 << " size " << dirp->buffer.size() << dendl;
7942 } else {
7943 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
7944 dirp->set_end();
7945 }
7946
7947 return res;
7948 }
7949
7950 struct dentry_off_lt {
7951 bool operator()(const Dentry* dn, int64_t off) const {
7952 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
7953 }
7954 };
7955
7956 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
7957 int caps, bool getref)
7958 {
7959 ceph_assert(client_lock.is_locked());
7960 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
7961 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
7962 << dendl;
7963 Dir *dir = dirp->inode->dir;
7964
7965 if (!dir) {
7966 ldout(cct, 10) << " dir is empty" << dendl;
7967 dirp->set_end();
7968 return 0;
7969 }
7970
7971 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
7972 dir->readdir_cache.end(),
7973 dirp->offset, dentry_off_lt());
7974
7975 string dn_name;
7976 while (true) {
7977 if (!dirp->inode->is_complete_and_ordered())
7978 return -EAGAIN;
7979 if (pd == dir->readdir_cache.end())
7980 break;
7981 Dentry *dn = *pd;
7982 if (dn->inode == NULL) {
7983 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
7984 ++pd;
7985 continue;
7986 }
7987 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
7988 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
7989 ++pd;
7990 continue;
7991 }
7992
7993 int r = _getattr(dn->inode, caps, dirp->perms);
7994 if (r < 0)
7995 return r;
7996
7997 struct ceph_statx stx;
7998 struct dirent de;
7999 fill_statx(dn->inode, caps, &stx);
8000
8001 uint64_t next_off = dn->offset + 1;
8002 ++pd;
8003 if (pd == dir->readdir_cache.end())
8004 next_off = dir_result_t::END;
8005
8006 Inode *in = NULL;
8007 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8008 if (getref) {
8009 in = dn->inode.get();
8010 _ll_get(in);
8011 }
8012
8013 dn_name = dn->name; // fill in name while we have lock
8014
8015 client_lock.Unlock();
8016 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8017 client_lock.Lock();
8018 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8019 << " = " << r << dendl;
8020 if (r < 0) {
8021 return r;
8022 }
8023
8024 dirp->offset = next_off;
8025 if (dirp->at_end())
8026 dirp->next_offset = 2;
8027 else
8028 dirp->next_offset = dirp->offset_low();
8029 dirp->last_name = dn_name; // we successfully returned this one; update!
8030 dirp->release_count = 0; // last_name no longer matches cache index
8031 if (r > 0)
8032 return r;
8033 }
8034
8035 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
8036 dirp->set_end();
8037 return 0;
8038 }
8039
8040 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8041 unsigned want, unsigned flags, bool getref)
8042 {
8043 int caps = statx_to_mask(flags, want);
8044
8045 std::lock_guard lock(client_lock);
8046
8047 if (unmounting)
8048 return -ENOTCONN;
8049
8050 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8051
8052 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
8053 << dec << " at_end=" << dirp->at_end()
8054 << " hash_order=" << dirp->hash_order() << dendl;
8055
8056 struct dirent de;
8057 struct ceph_statx stx;
8058 memset(&de, 0, sizeof(de));
8059 memset(&stx, 0, sizeof(stx));
8060
8061 InodeRef& diri = dirp->inode;
8062
8063 if (dirp->at_end())
8064 return 0;
8065
8066 if (dirp->offset == 0) {
8067 ldout(cct, 15) << " including ." << dendl;
8068 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
8069 uint64_t next_off = 1;
8070
8071 int r;
8072 r = _getattr(diri, caps, dirp->perms);
8073 if (r < 0)
8074 return r;
8075
8076 fill_statx(diri, caps, &stx);
8077 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8078
8079 Inode *inode = NULL;
8080 if (getref) {
8081 inode = diri.get();
8082 _ll_get(inode);
8083 }
8084
8085 client_lock.Unlock();
8086 r = cb(p, &de, &stx, next_off, inode);
8087 client_lock.Lock();
8088 if (r < 0)
8089 return r;
8090
8091 dirp->offset = next_off;
8092 if (r > 0)
8093 return r;
8094 }
8095 if (dirp->offset == 1) {
8096 ldout(cct, 15) << " including .." << dendl;
8097 uint64_t next_off = 2;
8098 InodeRef in;
8099 if (diri->dentries.empty())
8100 in = diri;
8101 else
8102 in = diri->get_first_parent()->dir->parent_inode;
8103
8104 int r;
8105 r = _getattr(in, caps, dirp->perms);
8106 if (r < 0)
8107 return r;
8108
8109 fill_statx(in, caps, &stx);
8110 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8111
8112 Inode *inode = NULL;
8113 if (getref) {
8114 inode = in.get();
8115 _ll_get(inode);
8116 }
8117
8118 client_lock.Unlock();
8119 r = cb(p, &de, &stx, next_off, inode);
8120 client_lock.Lock();
8121 if (r < 0)
8122 return r;
8123
8124 dirp->offset = next_off;
8125 if (r > 0)
8126 return r;
8127 }
8128
8129 // can we read from our cache?
8130 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8131 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8132 << dirp->inode->is_complete_and_ordered()
8133 << " issued " << ccap_string(dirp->inode->caps_issued())
8134 << dendl;
8135 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8136 dirp->inode->is_complete_and_ordered() &&
8137 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
8138 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
8139 if (err != -EAGAIN)
8140 return err;
8141 }
8142
8143 while (1) {
8144 if (dirp->at_end())
8145 return 0;
8146
8147 bool check_caps = true;
8148 if (!dirp->is_cached()) {
8149 int r = _readdir_get_frag(dirp);
8150 if (r)
8151 return r;
8152 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
8153 // different from the requested one. (our dirfragtree was outdated)
8154 check_caps = false;
8155 }
8156 frag_t fg = dirp->buffer_frag;
8157
8158 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8159 << " offset " << hex << dirp->offset << dendl;
8160
8161 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8162 dirp->offset, dir_result_t::dentry_off_lt());
8163 it != dirp->buffer.end();
8164 ++it) {
8165 dir_result_t::dentry &entry = *it;
8166
8167 uint64_t next_off = entry.offset + 1;
8168
8169 int r;
8170 if (check_caps) {
8171 r = _getattr(entry.inode, caps, dirp->perms);
8172 if (r < 0)
8173 return r;
8174 }
8175
8176 fill_statx(entry.inode, caps, &stx);
8177 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8178
8179 Inode *inode = NULL;
8180 if (getref) {
8181 inode = entry.inode.get();
8182 _ll_get(inode);
8183 }
8184
8185 client_lock.Unlock();
8186 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
8187 client_lock.Lock();
8188
8189 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
8190 << " = " << r << dendl;
8191 if (r < 0)
8192 return r;
8193
8194 dirp->offset = next_off;
8195 if (r > 0)
8196 return r;
8197 }
8198
8199 if (dirp->next_offset > 2) {
8200 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
8201 _readdir_drop_dirp_buffer(dirp);
8202 continue; // more!
8203 }
8204
8205 if (!fg.is_rightmost()) {
8206 // next frag!
8207 _readdir_next_frag(dirp);
8208 continue;
8209 }
8210
8211 if (diri->shared_gen == dirp->start_shared_gen &&
8212 diri->dir_release_count == dirp->release_count) {
8213 if (diri->dir_ordered_count == dirp->ordered_count) {
8214 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
8215 if (diri->dir) {
8216 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
8217 diri->dir->readdir_cache.resize(dirp->cache_index);
8218 }
8219 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
8220 } else {
8221 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
8222 diri->flags |= I_COMPLETE;
8223 }
8224 }
8225
8226 dirp->set_end();
8227 return 0;
8228 }
8229 ceph_abort();
8230 return 0;
8231 }
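// A sketch of the add_dirent_cb_t contract enforced above: the callback
// returns 0 to keep iterating, >0 to stop after consuming the current
// entry, and <0 to abort and propagate the error. A minimal counting
// callback (count_cb is hypothetical) would be:
//
//   static int count_cb(void *p, struct dirent *de, struct ceph_statx *stx,
//                       off_t next_off, Inode *in) {
//     ++*static_cast<int*>(p);
//     return 0; // keep going
//   }
//   // usage: int n = 0; readdir_r_cb(dirp, count_cb, &n, 0, 0, false);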
8232
8233
8234 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8235 {
8236 return readdirplus_r(d, de, 0, 0, 0, NULL);
8237 }
8238
8239 /*
8240 * readdirplus_r
8241 *
8242 * returns
8243 * 1 if we got a dirent
8244 * 0 for end of directory
8245 * <0 on error
8246 */
8247
8248 struct single_readdir {
8249 struct dirent *de;
8250 struct ceph_statx *stx;
8251 Inode *inode;
8252 bool full;
8253 };
8254
8255 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8256 struct ceph_statx *stx, off_t off,
8257 Inode *in)
8258 {
8259 single_readdir *c = static_cast<single_readdir *>(p);
8260
8261 if (c->full)
8262 return -1; // already filled this dirent
8263
8264 *c->de = *de;
8265 if (c->stx)
8266 *c->stx = *stx;
8267 c->inode = in;
8268 c->full = true;
8269 return 1;
8270 }
8271
8272 struct dirent *Client::readdir(dir_result_t *d)
8273 {
8274 int ret;
8275 static struct dirent de;
8276 single_readdir sr;
8277 sr.de = &de;
8278 sr.stx = NULL;
8279 sr.inode = NULL;
8280 sr.full = false;
8281
8282 // our callback fills the dirent and sets sr.full=true on first
8283 // call, and returns -1 the second time around.
8284 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8285 if (ret < -1) {
8286 errno = -ret; // this sucks.
8287 return (dirent *) NULL;
8288 }
8289 if (sr.full) {
8290 return &de;
8291 }
8292 return (dirent *) NULL;
8293 }
8294
8295 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8296 struct ceph_statx *stx, unsigned want,
8297 unsigned flags, Inode **out)
8298 {
8299 single_readdir sr;
8300 sr.de = de;
8301 sr.stx = stx;
8302 sr.inode = NULL;
8303 sr.full = false;
8304
8305 // our callback fills the dirent and sets sr.full=true on first
8306 // call, and returns -1 the second time around.
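  // n.b. `out` (an Inode **) is passed as the bool getref argument, so an
  // inode reference is taken only when the caller asked for the inode back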
8307 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8308 if (r < -1)
8309 return r;
8310 if (out)
8311 *out = sr.inode;
8312 if (sr.full)
8313 return 1;
8314 return 0;
8315 }
8316
8317
8318 /* getdents */
8319 struct getdents_result {
8320 char *buf;
8321 int buflen;
8322 int pos;
8323 bool fullent;
8324 };
8325
8326 static int _readdir_getdent_cb(void *p, struct dirent *de,
8327 struct ceph_statx *stx, off_t off, Inode *in)
8328 {
8329 struct getdents_result *c = static_cast<getdents_result *>(p);
8330
8331 int dlen;
8332 if (c->fullent)
8333 dlen = sizeof(*de);
8334 else
8335 dlen = strlen(de->d_name) + 1;
8336
8337 if (c->pos + dlen > c->buflen)
8338 return -1; // doesn't fit
8339
8340 if (c->fullent) {
8341 memcpy(c->buf + c->pos, de, sizeof(*de));
8342 } else {
8343 memcpy(c->buf + c->pos, de->d_name, dlen);
8344 }
8345 c->pos += dlen;
8346 return 0;
8347 }
8348
8349 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8350 {
8351 getdents_result gr;
8352 gr.buf = buf;
8353 gr.buflen = buflen;
8354 gr.fullent = fullent;
8355 gr.pos = 0;
8356
8357 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8358
8359 if (r < 0) { // some error
8360 if (r == -1) { // buffer ran out of space
8361 if (gr.pos) { // but we got some entries already!
8362 return gr.pos;
8363 } // or we need a larger buffer
8364 return -ERANGE;
8365 } else { // actual error, return it
8366 return r;
8367 }
8368 }
8369 return gr.pos;
8370 }
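// A sketch of the calling convention for _getdents() above (`consume` is a
// placeholder): loop until it returns 0 (end of directory) or a negative
// error; -ERANGE means even the first entry did not fit in the buffer.
//
//   char buf[4096];
//   int n;
//   while ((n = _getdents(dirp, buf, sizeof(buf), true)) > 0)
//     consume(buf, n); // parse the packed struct dirent records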
8371
8372
8373 /* getdir */
8374 struct getdir_result {
8375 list<string> *contents;
8376 int num;
8377 };
8378
8379 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8380 {
8381 getdir_result *r = static_cast<getdir_result *>(p);
8382
8383 r->contents->push_back(de->d_name);
8384 r->num++;
8385 return 0;
8386 }
8387
8388 int Client::getdir(const char *relpath, list<string>& contents,
8389 const UserPerm& perms)
8390 {
8391 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8392 {
8393 std::lock_guard lock(client_lock);
8394 tout(cct) << "getdir" << std::endl;
8395 tout(cct) << relpath << std::endl;
8396 }
8397
8398 dir_result_t *d;
8399 int r = opendir(relpath, &d, perms);
8400 if (r < 0)
8401 return r;
8402
8403 getdir_result gr;
8404 gr.contents = &contents;
8405 gr.num = 0;
8406 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8407
8408 closedir(d);
8409
8410 if (r < 0)
8411 return r;
8412 return gr.num;
8413 }
8414
8415
8416 /****** file i/o **********/
8417 int Client::open(const char *relpath, int flags, const UserPerm& perms,
8418 mode_t mode, int stripe_unit, int stripe_count,
8419 int object_size, const char *data_pool)
8420 {
8421 ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
8422 std::lock_guard lock(client_lock);
8423 tout(cct) << "open" << std::endl;
8424 tout(cct) << relpath << std::endl;
8425 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
8426
8427 if (unmounting)
8428 return -ENOTCONN;
8429
8430 Fh *fh = NULL;
8431
8432 #if defined(__linux__) && defined(O_PATH)
8433 /* When O_PATH is specified, flags other than O_DIRECTORY
8434 * and O_NOFOLLOW are ignored. Please refer to the do_entry_open()
8435 * function in the kernel (fs/open.c). */
8436 if (flags & O_PATH)
8437 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
8438 #endif
8439
8440 filepath path(relpath);
8441 InodeRef in;
8442 bool created = false;
8443 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
8444 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
8445 int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));
8446
8447 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
8448 return -EEXIST;
8449
8450 #if defined(__linux__) && defined(O_PATH)
8451 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
8452 #else
8453 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
8454 #endif
8455 return -ELOOP;
8456
8457 if (r == -ENOENT && (flags & O_CREAT)) {
8458 filepath dirpath = path;
8459 string dname = dirpath.last_dentry();
8460 dirpath.pop_dentry();
8461 InodeRef dir;
8462 r = path_walk(dirpath, &dir, perms, true,
8463 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
8464 if (r < 0)
8465 goto out;
8466 if (cct->_conf->client_permissions) {
8467 r = may_create(dir.get(), perms);
8468 if (r < 0)
8469 goto out;
8470 }
8471 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
8472 stripe_count, object_size, data_pool, &created, perms);
8473 }
8474 if (r < 0)
8475 goto out;
8476
8477 if (!created) {
8478 // posix says we can only check permissions of existing files
8479 if (cct->_conf->client_permissions) {
8480 r = may_open(in.get(), flags, perms);
8481 if (r < 0)
8482 goto out;
8483 }
8484 }
8485
8486 if (!fh)
8487 r = _open(in.get(), flags, mode, &fh, perms);
8488 if (r >= 0) {
8489 // allocate an integer file descriptor
8490 ceph_assert(fh);
8491 r = get_fd();
8492 ceph_assert(fd_map.count(r) == 0);
8493 fd_map[r] = fh;
8494 }
8495
8496 out:
8497 tout(cct) << r << std::endl;
8498 ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
8499 return r;
8500 }
8501
8502 int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8503 {
8504 /* Use default file striping parameters */
8505 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8506 }
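// Usage sketch (path and layout values are hypothetical): the long form
// above lets a creating open choose the file layout, while the short form
// keeps the layout defaults:
//
//   // UserPerm perms = ...; // from the application context
//   int fd = client->open("/logs/a.txt",
//                         O_WRONLY|O_CREAT|O_EXCL, perms, 0644,
//                         1<<20 /* stripe_unit */, 4 /* stripe_count */,
//                         1<<22 /* object_size */, NULL /* data_pool */);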
8507
8508 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8509 const UserPerm& perms)
8510 {
8511 std::lock_guard lock(client_lock);
8512 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8513
8514 if (unmounting)
8515 return -ENOTCONN;
8516
8517 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8518 filepath path(ino);
8519 req->set_filepath(path);
8520
8521 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8522 char f[30];
8523 sprintf(f, "%u", h);
8524 filepath path2(dirino);
8525 path2.push_dentry(string(f));
8526 req->set_filepath2(path2);
8527
8528 int r = make_request(req, perms, NULL, NULL,
8529 rand() % mdsmap->get_num_in_mds());
8530 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8531 return r;
8532 }
8533
8534
8535 /**
8536 * Load inode into local cache.
8537 *
8538 * If the inode pointer is non-NULL, also take a reference on
8539 * the resulting Inode object in one operation, so that the caller
8540 * can safely assume the inode will still be there after return.
8541 */
8542 int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8543 {
8544 ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
8545
8546 if (unmounting)
8547 return -ENOTCONN;
8548
8549 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8550 filepath path(ino);
8551 req->set_filepath(path);
8552
8553 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8554 if (r == 0 && inode != NULL) {
8555 vinodeno_t vino(ino, CEPH_NOSNAP);
8556 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8557 ceph_assert(p != inode_map.end());
8558 *inode = p->second;
8559 _ll_get(*inode);
8560 }
8561 ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
8562 return r;
8563 }
8564
8565 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8566 {
8567 std::lock_guard lock(client_lock);
8568 return _lookup_ino(ino, perms, inode);
8569 }
8570
8571 /**
8572 * Find the parent inode of `ino` and insert it into
8573 * our cache. Conditionally also set `parent` to a referenced
8574 * Inode* if caller provides non-NULL value.
8575 */
8576 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8577 {
8578 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
8579
8580 if (unmounting)
8581 return -ENOTCONN;
8582
8583 if (!ino->dentries.empty()) {
8584 // if we exposed the parent here, we'd need to check permissions,
8585 // but right now we just rely on the MDS doing so in make_request
8586 ldout(cct, 8) << __func__ << " dentry already present" << dendl;
8587 return 0;
8588 }
8589
8590 if (ino->is_root()) {
8591 *parent = NULL;
8592 ldout(cct, 8) << "ino is root, no parent" << dendl;
8593 return -EINVAL;
8594 }
8595
8596 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8597 filepath path(ino->ino);
8598 req->set_filepath(path);
8599
8600 InodeRef target;
8601 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8602 // Give caller a reference to the parent ino if they provided a pointer.
8603 if (parent != NULL) {
8604 if (r == 0) {
8605 *parent = target.get();
8606 _ll_get(*parent);
8607 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
8608 } else {
8609 *parent = NULL;
8610 }
8611 }
8612 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
8613 return r;
8614 }
8615
8616 int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8617 {
8618 std::lock_guard lock(client_lock);
8619 return _lookup_parent(ino, perms, parent);
8620 }
8621
8622 /**
8623 * Populate the parent dentry for `ino`, provided it is
8624 * a child of `parent`.
8625 */
8626 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8627 {
8628 ceph_assert(parent->is_dir());
8629 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
8630
8631 if (unmounting)
8632 return -ENOTCONN;
8633
8634 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8635 req->set_filepath2(filepath(parent->ino));
8636 req->set_filepath(filepath(ino->ino));
8637 req->set_inode(ino);
8638
8639 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8640 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
8641 return r;
8642 }
8643
8644 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8645 {
8646 std::lock_guard lock(client_lock);
8647 return _lookup_name(ino, parent, perms);
8648 }
8649
8650 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8651 {
8652 ceph_assert(in);
8653 Fh *f = new Fh(in, flags, cmode, perms);
8654
8655 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
8656
8657 if (in->snapid != CEPH_NOSNAP) {
8658 in->snap_cap_refs++;
8659 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8660 << ccap_string(in->caps_issued()) << dendl;
8661 }
8662
8663 const auto& conf = cct->_conf;
8664 f->readahead.set_trigger_requests(1);
8665 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8666 uint64_t max_readahead = Readahead::NO_LIMIT;
8667 if (conf->client_readahead_max_bytes) {
8668 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8669 }
8670 if (conf->client_readahead_max_periods) {
8671 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8672 }
8673 f->readahead.set_max_readahead_size(max_readahead);
8674 vector<uint64_t> alignments;
8675 alignments.push_back(in->layout.get_period());
8676 alignments.push_back(in->layout.stripe_unit);
8677 f->readahead.set_alignments(alignments);
8678
8679 return f;
8680 }
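// Worked example of the readahead limits computed above (config values
// assumed): with a 4 MiB stripe period, client_readahead_max_bytes = 8 MiB
// and client_readahead_max_periods = 4,
//
//   max_readahead = min(NO_LIMIT, 8 MiB, 4 MiB * 4) = 8 MiB
//
// i.e. the tighter of the byte cap and the period cap wins, and readahead
// extents are aligned to both the stripe unit and the full stripe period.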
8681
8682 int Client::_release_fh(Fh *f)
8683 {
8684 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8685 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8686 Inode *in = f->inode.get();
8687 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
8688
8689 in->unset_deleg(f);
8690
8691 if (in->snapid == CEPH_NOSNAP) {
8692 if (in->put_open_ref(f->mode)) {
8693 _flush(in, new C_Client_FlushComplete(this, in));
8694 check_caps(in, 0);
8695 }
8696 } else {
8697 ceph_assert(in->snap_cap_refs > 0);
8698 in->snap_cap_refs--;
8699 }
8700
8701 _release_filelocks(f);
8702
8703 // Finally, read any async err (i.e. from flushes)
8704 int err = f->take_async_err();
8705 if (err != 0) {
8706 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
8707 << cpp_strerror(err) << dendl;
8708 } else {
8709 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
8710 }
8711
8712 _put_fh(f);
8713
8714 return err;
8715 }
8716
8717 void Client::_put_fh(Fh *f)
8718 {
8719 int left = f->put();
8720 if (!left) {
8721 delete f;
8722 }
8723 }
8724
8725 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
8726 const UserPerm& perms)
8727 {
8728 if (in->snapid != CEPH_NOSNAP &&
8729 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
8730 return -EROFS;
8731 }
8732
8733 // use normalized flags to generate cmode
8734 int cflags = ceph_flags_sys2wire(flags);
8735 if (cct->_conf.get_val<bool>("client_force_lazyio"))
8736 cflags |= CEPH_O_LAZY;
8737
8738 int cmode = ceph_flags_to_mode(cflags);
8739 int want = ceph_caps_for_mode(cmode);
8740 int result = 0;
8741
8742 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
8743
8744 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
8745 // update wanted?
8746 check_caps(in, CHECK_CAPS_NODELAY);
8747 } else {
8748
8749 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8750 filepath path;
8751 in->make_nosnap_relative_path(path);
8752 req->set_filepath(path);
8753 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
8754 req->head.args.open.mode = mode;
8755 req->head.args.open.pool = -1;
8756 if (cct->_conf->client_debug_getattr_caps)
8757 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8758 else
8759 req->head.args.open.mask = 0;
8760 req->head.args.open.old_size = in->size; // for O_TRUNC
8761 req->set_inode(in);
8762 result = make_request(req, perms);
8763
8764 /*
8765 * NFS expects that delegations will be broken on a conflicting open,
8766 * not just when there is actual conflicting access to the file. SMB leases
8767 * and oplocks also have similar semantics.
8768 *
8769 * Ensure that clients that have delegations enabled will wait on minimal
8770 * caps during open, just to ensure that other clients holding delegations
8771 * return theirs first.
8772 */
8773 if (deleg_timeout && result == 0) {
8774 int need = 0, have;
8775
8776 if (cmode & CEPH_FILE_MODE_WR)
8777 need |= CEPH_CAP_FILE_WR;
8778 if (cmode & CEPH_FILE_MODE_RD)
8779 need |= CEPH_CAP_FILE_RD;
8780
8781 result = get_caps(in, need, want, &have, -1);
8782 if (result < 0) {
8783 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
8784 " . Denying open: " <<
8785 cpp_strerror(result) << dendl;
8786 in->put_open_ref(cmode);
8787 } else {
8788 put_cap_ref(in, need);
8789 }
8790 }
8791 }
8792
8793 // success?
8794 if (result >= 0) {
8795 if (fhp)
8796 *fhp = _create_fh(in, flags, cmode, perms);
8797 } else {
8798 in->put_open_ref(cmode);
8799 }
8800
8801 trim_cache();
8802
8803 return result;
8804 }
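// A sketch of the flag normalization in _open() above (assuming the stock
// ceph_flags_to_mode()/ceph_caps_for_mode() tables):
//
//   O_RDONLY -> CEPH_FILE_MODE_RD   -> wants FILE_SHARED|FILE_RD|FILE_CACHE
//   O_RDWR   -> CEPH_FILE_MODE_RDWR -> additionally FILE_EXCL|FILE_WR|FILE_BUFFER
//
// When the wanted caps are already issued and O_TRUNC is not set, the open
// is satisfied locally; otherwise a CEPH_MDS_OP_OPEN round trip is made.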
8805
8806 int Client::_renew_caps(Inode *in)
8807 {
8808 int wanted = in->caps_file_wanted();
8809 if (in->is_any_caps() &&
8810 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
8811 check_caps(in, CHECK_CAPS_NODELAY);
8812 return 0;
8813 }
8814
8815 int flags = 0;
8816 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
8817 flags = O_RDWR;
8818 else if (wanted & CEPH_CAP_FILE_RD)
8819 flags = O_RDONLY;
8820 else if (wanted & CEPH_CAP_FILE_WR)
8821 flags = O_WRONLY;
8822
8823 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8824 filepath path;
8825 in->make_nosnap_relative_path(path);
8826 req->set_filepath(path);
8827 req->head.args.open.flags = flags;
8828 req->head.args.open.pool = -1;
8829 if (cct->_conf->client_debug_getattr_caps)
8830 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8831 else
8832 req->head.args.open.mask = 0;
8833 req->set_inode(in);
8834
8835 // duplicate in case Cap goes away; not sure if that race is a concern?
8836 const UserPerm *pperm = in->get_best_perms();
8837 UserPerm perms;
8838 if (pperm != NULL)
8839 perms = *pperm;
8840 int ret = make_request(req, perms);
8841 return ret;
8842 }
8843
8844 int Client::close(int fd)
8845 {
8846 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8847 std::lock_guard lock(client_lock);
8848 tout(cct) << "close" << std::endl;
8849 tout(cct) << fd << std::endl;
8850
8851 if (unmounting)
8852 return -ENOTCONN;
8853
8854 Fh *fh = get_filehandle(fd);
8855 if (!fh)
8856 return -EBADF;
8857 int err = _release_fh(fh);
8858 fd_map.erase(fd);
8859 put_fd(fd);
8860 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8861 return err;
8862 }
8863
8864
8865 // ------------
8866 // read, write
8867
8868 loff_t Client::lseek(int fd, loff_t offset, int whence)
8869 {
8870 std::lock_guard lock(client_lock);
8871 tout(cct) << "lseek" << std::endl;
8872 tout(cct) << fd << std::endl;
8873 tout(cct) << offset << std::endl;
8874 tout(cct) << whence << std::endl;
8875
8876 if (unmounting)
8877 return -ENOTCONN;
8878
8879 Fh *f = get_filehandle(fd);
8880 if (!f)
8881 return -EBADF;
8882 #if defined(__linux__) && defined(O_PATH)
8883 if (f->flags & O_PATH)
8884 return -EBADF;
8885 #endif
8886 return _lseek(f, offset, whence);
8887 }
8888
8889 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8890 {
8891 Inode *in = f->inode.get();
8892 int r;
8893 loff_t pos = -1;
8894
8895 switch (whence) {
8896 case SEEK_SET:
8897 pos = offset;
8898 break;
8899
8900 case SEEK_CUR:
8901 pos = f->pos + offset; // seek relative to the current fd position (pos was initialized to -1)
8902 break;
8903
8904 case SEEK_END:
8905 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8906 if (r < 0)
8907 return r;
8908 pos = in->size + offset;
8909 break;
8910
8911 default:
8912 ceph_abort();
8913 }
8914
8915 if (pos < 0) {
8916 return -EINVAL;
8917 } else {
8918 f->pos = pos;
8919 }
8920
8921 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8922 return f->pos;
8923 }
8924
8925
8926 void Client::lock_fh_pos(Fh *f)
8927 {
8928 ldout(cct, 10) << __func__ << " " << f << dendl;
8929
8930 if (f->pos_locked || !f->pos_waiters.empty()) {
8931 Cond cond;
8932 f->pos_waiters.push_back(&cond);
8933 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
8934 while (f->pos_locked || f->pos_waiters.front() != &cond)
8935 cond.Wait(client_lock);
8936 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
8937 ceph_assert(f->pos_waiters.front() == &cond);
8938 f->pos_waiters.pop_front();
8939 }
8940
8941 f->pos_locked = true;
8942 }
8943
8944 void Client::unlock_fh_pos(Fh *f)
8945 {
8946 ldout(cct, 10) << __func__ << " " << f << dendl;
8947 f->pos_locked = false;
8948 }
8949
8950 int Client::uninline_data(Inode *in, Context *onfinish)
8951 {
8952 if (!in->inline_data.length()) {
8953 onfinish->complete(0);
8954 return 0;
8955 }
8956
8957 char oid_buf[32];
8958 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
8959 object_t oid = oid_buf;
8960
8961 ObjectOperation create_ops;
8962 create_ops.create(false);
8963
8964 objecter->mutate(oid,
8965 OSDMap::file_to_object_locator(in->layout),
8966 create_ops,
8967 in->snaprealm->get_snap_context(),
8968 ceph::real_clock::now(),
8969 0,
8970 NULL);
8971
8972 bufferlist inline_version_bl;
8973 encode(in->inline_version, inline_version_bl);
8974
8975 ObjectOperation uninline_ops;
8976 uninline_ops.cmpxattr("inline_version",
8977 CEPH_OSD_CMPXATTR_OP_GT,
8978 CEPH_OSD_CMPXATTR_MODE_U64,
8979 inline_version_bl);
8980 bufferlist inline_data = in->inline_data;
8981 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
8982 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
8983
8984 objecter->mutate(oid,
8985 OSDMap::file_to_object_locator(in->layout),
8986 uninline_ops,
8987 in->snaprealm->get_snap_context(),
8988 ceph::real_clock::now(),
8989 0,
8990 onfinish);
8991
8992 return 0;
8993 }
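// A note on the two-step protocol in uninline_data() above: the first
// mutate creates the backing object if needed (create(false) tolerates an
// existing object), and the second writes the inline payload guarded by
// cmpxattr("inline_version", GT, ...), so a racing client that already
// migrated a newer inline_version fails the guard instead of clobbering
// newer data. Only after onfinish succeeds does the caller clear
// in->inline_data and set inline_version = CEPH_INLINE_NONE (see
// _read()/_write()).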
8994
8995 //
8996
8997 // blocking osd interface
8998
8999 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9000 {
9001 std::lock_guard lock(client_lock);
9002 tout(cct) << "read" << std::endl;
9003 tout(cct) << fd << std::endl;
9004 tout(cct) << size << std::endl;
9005 tout(cct) << offset << std::endl;
9006
9007 if (unmounting)
9008 return -ENOTCONN;
9009
9010 Fh *f = get_filehandle(fd);
9011 if (!f)
9012 return -EBADF;
9013 #if defined(__linux__) && defined(O_PATH)
9014 if (f->flags & O_PATH)
9015 return -EBADF;
9016 #endif
9017 bufferlist bl;
9018 /* We can't return more bytes read than INT_MAX, clamp size to that */
9019 size = std::min(size, (loff_t)INT_MAX);
9020 int r = _read(f, offset, size, &bl);
9021 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9022 if (r >= 0) {
9023 bl.copy(0, bl.length(), buf);
9024 r = bl.length();
9025 }
9026 return r;
9027 }
9028
9029 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9030 {
9031 if (iovcnt < 0)
9032 return -EINVAL;
9033 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9034 }
9035
9036 int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
9037 {
9038 int want, have = 0;
9039 bool movepos = false;
9040 std::unique_ptr<C_SaferCond> onuninline;
9041 int64_t r = 0;
9042 const auto& conf = cct->_conf;
9043 Inode *in = f->inode.get();
9044 utime_t lat;
9045 utime_t start = ceph_clock_now();
9046
9047 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
9048 return -EBADF;
9049 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9050
9051 if (offset < 0) {
9052 lock_fh_pos(f);
9053 offset = f->pos;
9054 movepos = true;
9055 }
9056 loff_t start_pos = offset;
9057
9058 if (in->inline_version == 0) {
9059 r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9060 if (r < 0) {
9061 goto done;
9062 }
9063 ceph_assert(in->inline_version > 0);
9064 }
9065
9066 retry:
9067 if (f->mode & CEPH_FILE_MODE_LAZY)
9068 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
9069 else
9070 want = CEPH_CAP_FILE_CACHE;
9071 r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
9072 if (r < 0) {
9073 goto done;
9074 }
9075 if (f->flags & O_DIRECT)
9076 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
9077
9078 if (in->inline_version < CEPH_INLINE_NONE) {
9079 if (!(have & CEPH_CAP_FILE_CACHE)) {
9080 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9081 uninline_data(in, onuninline.get());
9082 } else {
9083 uint32_t len = in->inline_data.length();
9084 uint64_t endoff = offset + size;
9085 if (endoff > in->size)
9086 endoff = in->size;
9087
9088 if (offset < len) {
9089 if (endoff <= len) {
9090 bl->substr_of(in->inline_data, offset, endoff - offset);
9091 } else {
9092 bl->substr_of(in->inline_data, offset, len - offset);
9093 bl->append_zero(endoff - len);
9094 }
9095 r = endoff - offset;
9096 } else if ((uint64_t)offset < endoff) {
9097 bl->append_zero(endoff - offset);
9098 r = endoff - offset;
9099 } else {
9100 r = 0;
9101 }
9102 goto success;
9103 }
9104 }
9105
9106 if (!conf->client_debug_force_sync_read &&
9107 conf->client_oc &&
9108 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
9109
9110 if (f->flags & O_RSYNC) {
9111 _flush_range(in, offset, size);
9112 }
9113 r = _read_async(f, offset, size, bl);
9114 if (r < 0)
9115 goto done;
9116 } else {
9117 if (f->flags & O_DIRECT)
9118 _flush_range(in, offset, size);
9119
9120 bool checkeof = false;
9121 r = _read_sync(f, offset, size, bl, &checkeof);
9122 if (r < 0)
9123 goto done;
9124 if (checkeof) {
9125 offset += r;
9126 size -= r;
9127
9128 put_cap_ref(in, CEPH_CAP_FILE_RD);
9129 have = 0;
9130 // reverify size
9131 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9132 if (r < 0)
9133 goto done;
9134
9135 // eof? short read.
9136 if ((uint64_t)offset < in->size)
9137 goto retry;
9138 }
9139 }
9140
9141 success:
9142 ceph_assert(r >= 0);
9143 if (movepos) {
9144 // adjust fd pos
9145 f->pos = start_pos + r;
9146 }
9147
9148 lat = ceph_clock_now();
9149 lat -= start;
9150 logger->tinc(l_c_read, lat);
9151
9152 done:
9153 // done!
9154
9155 if (onuninline) {
9156 client_lock.Unlock();
9157 int ret = onuninline->wait();
9158 client_lock.Lock();
9159 if (ret >= 0 || ret == -ECANCELED) {
9160 in->inline_data.clear();
9161 in->inline_version = CEPH_INLINE_NONE;
9162 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9163 check_caps(in, 0);
9164 } else
9165 r = ret;
9166 }
9167 if (have) {
9168 put_cap_ref(in, CEPH_CAP_FILE_RD);
9169 }
9170 if (movepos) {
9171 unlock_fh_pos(f);
9172 }
9173 return r;
9174 }
9175
9176 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
9177 client(c), f(f) {
9178 f->get();
9179 f->readahead.inc_pending();
9180 }
9181
9182 Client::C_Readahead::~C_Readahead() {
9183 f->readahead.dec_pending();
9184 client->_put_fh(f);
9185 }
9186
9187 void Client::C_Readahead::finish(int r) {
9188 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
9189 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9190 }
9191
9192 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
9193 {
9194 const auto& conf = cct->_conf;
9195 Inode *in = f->inode.get();
9196
9197 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9198
9199 // trim read based on file size?
9200 if (off >= in->size)
9201 return 0;
9202 if (len == 0)
9203 return 0;
9204 if (off + len > in->size) {
9205 len = in->size - off;
9206 }
9207
9208 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
9209 << " max_bytes=" << f->readahead.get_max_readahead_size()
9210 << " max_periods=" << conf->client_readahead_max_periods << dendl;
9211
9212 // read (and possibly block)
9213 int r = 0;
9214 C_SaferCond onfinish("Client::_read_async flock");
9215 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9216 off, len, bl, 0, &onfinish);
9217 if (r == 0) {
9218 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
9219 client_lock.Unlock();
9220 r = onfinish.wait();
9221 client_lock.Lock();
9222 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
9223 }
9224
9225 if(f->readahead.get_min_readahead_size() > 0) {
9226 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
9227 if (readahead_extent.second > 0) {
9228 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
9229 << " (caller wants " << off << "~" << len << ")" << dendl;
9230 Context *onfinish2 = new C_Readahead(this, f);
9231 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9232 readahead_extent.first, readahead_extent.second,
9233 NULL, 0, onfinish2);
9234 if (r2 == 0) {
9235 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
9236 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9237 } else {
9238 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
9239 delete onfinish2;
9240 }
9241 }
9242 }
9243
9244 return r;
9245 }
9246
9247 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9248 bool *checkeof)
9249 {
9250 Inode *in = f->inode.get();
9251 uint64_t pos = off;
9252 int left = len;
9253 int read = 0;
9254
9255 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9256
9257 Mutex flock("Client::_read_sync flock");
9258 Cond cond;
9259 while (left > 0) {
9260 C_SaferCond onfinish("Client::_read_sync flock");
9261 bufferlist tbl;
9262
9263 int wanted = left;
9264 filer->read_trunc(in->ino, &in->layout, in->snapid,
9265 pos, left, &tbl, 0,
9266 in->truncate_size, in->truncate_seq,
9267 &onfinish);
9268 client_lock.Unlock();
9269 int r = onfinish.wait();
9270 client_lock.Lock();
9271
9272 // if we get ENOENT from OSD, assume 0 bytes returned
9273 if (r == -ENOENT)
9274 r = 0;
9275 if (r < 0)
9276 return r;
9277 if (tbl.length()) {
9278 r = tbl.length();
9279
9280 read += r;
9281 pos += r;
9282 left -= r;
9283 bl->claim_append(tbl);
9284 }
9285 // short read?
9286 if (r >= 0 && r < wanted) {
9287 if (pos < in->size) {
9288 // zero up to known EOF
9289 int64_t some = in->size - pos;
9290 if (some > left)
9291 some = left;
9292 auto z = buffer::ptr_node::create(some);
9293 z->zero();
9294 bl->push_back(std::move(z));
9295 read += some;
9296 pos += some;
9297 left -= some;
9298 if (left == 0)
9299 return read;
9300 }
9301
9302 *checkeof = true;
9303 return read;
9304 }
9305 }
9306 return read;
9307 }
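// Worked example of the short-read handling above: a 4096-byte read at
// pos=0 against in->size=1000 that gets r=0 back from the OSD zero-fills
// min(in->size - pos, left) = 1000 bytes up to the known EOF; since bytes
// are still wanted, *checkeof is set and _read() re-verifies the size via
// _getattr() and retries, in case the file grew under us.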
9308
9309
9310 /*
9311 * we keep count of uncommitted sync writes on the inode, so that
9312 * fsync can DDRT (do the right thing).
9313 */
9314 void Client::_sync_write_commit(Inode *in)
9315 {
9316 ceph_assert(unsafe_sync_write > 0);
9317 unsafe_sync_write--;
9318
9319 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9320
9321 ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
9322 if (unsafe_sync_write == 0 && unmounting) {
9323 ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
9324 mount_cond.Signal();
9325 }
9326 }
9327
9328 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9329 {
9330 std::lock_guard lock(client_lock);
9331 tout(cct) << "write" << std::endl;
9332 tout(cct) << fd << std::endl;
9333 tout(cct) << size << std::endl;
9334 tout(cct) << offset << std::endl;
9335
9336 if (unmounting)
9337 return -ENOTCONN;
9338
9339 Fh *fh = get_filehandle(fd);
9340 if (!fh)
9341 return -EBADF;
9342 #if defined(__linux__) && defined(O_PATH)
9343 if (fh->flags & O_PATH)
9344 return -EBADF;
9345 #endif
9346 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9347 size = std::min(size, (loff_t)INT_MAX);
9348 int r = _write(fh, offset, size, buf, NULL, false);
9349 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9350 return r;
9351 }
9352
9353 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9354 {
9355 if (iovcnt < 0)
9356 return -EINVAL;
9357 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9358 }
9359
9360 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
9361 unsigned iovcnt, int64_t offset, bool write,
9362 bool clamp_to_int)
9363 {
9364 #if defined(__linux__) && defined(O_PATH)
9365 if (fh->flags & O_PATH)
9366 return -EBADF;
9367 #endif
9368 loff_t totallen = 0;
9369 for (unsigned i = 0; i < iovcnt; i++) {
9370 totallen += iov[i].iov_len;
9371 }
9372
9373 /*
9374 * Some of the API functions take 64-bit size values, but only return
9375 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9376 * we don't do I/Os larger than the values we can return.
9377 */
9378 if (clamp_to_int) {
9379 totallen = std::min(totallen, (loff_t)INT_MAX);
9380 }
9381 if (write) {
9382 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
9383 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
9384 return w;
9385 } else {
9386 bufferlist bl;
9387 int64_t r = _read(fh, offset, totallen, &bl);
9388 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
9389 if (r <= 0)
9390 return r;
9391
9392 int bufoff = 0;
9393 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
9394 /*
9395 * This handles the case where the bufferlist does not have enough
9396 * data to fill the entire iov
9397 */
9398 if (resid < iov[j].iov_len) {
9399 bl.copy(bufoff, resid, (char *)iov[j].iov_base);
9400 break;
9401 } else {
9402 bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
9403 }
9404 resid -= iov[j].iov_len;
9405 bufoff += iov[j].iov_len;
9406 }
9407 return r;
9408 }
9409 }
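// Illustrative sketch (not part of the original source): the read branch
// above scatters one flat buffer across the caller's iovecs. The same
// copy logic in isolation, where 'resid' is the byte count actually read
// (possibly less than the sum of the iov lengths):
//
//   #include <algorithm>
//   #include <cstring>
//   #include <sys/uio.h>
//
//   size_t scatter(const char *src, size_t resid,
//                  const struct iovec *iov, int iovcnt)
//   {
//     size_t off = 0;
//     for (int j = 0; j < iovcnt && resid > 0; j++) {
//       size_t n = std::min(resid, iov[j].iov_len);
//       memcpy(iov[j].iov_base, src + off, n);
//       off += n;
//       resid -= n;
//     }
//     return off;   // total bytes copied
//   }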
9410
9411 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9412 {
9413 std::lock_guard lock(client_lock);
9414 tout(cct) << fd << std::endl;
9415 tout(cct) << offset << std::endl;
9416
9417 if (unmounting)
9418 return -ENOTCONN;
9419
9420 Fh *fh = get_filehandle(fd);
9421 if (!fh)
9422 return -EBADF;
9423 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9424 }
9425
9426 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
9427 const struct iovec *iov, int iovcnt)
9428 {
9429 uint64_t fpos = 0;
9430
9431 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
9432 return -EFBIG;
9433
9434 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9435 Inode *in = f->inode.get();
9436
9437 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
9438 return -ENOSPC;
9439 }
9440
9441 ceph_assert(in->snapid == CEPH_NOSNAP);
9442
9443 // was Fh opened as writeable?
9444 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9445 return -EBADF;
9446
9447 // use/adjust fd pos?
9448 if (offset < 0) {
9449 lock_fh_pos(f);
9450 /*
9451 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9452 * change out from under us.
9453 */
9454 if (f->flags & O_APPEND) {
9455 int r = _lseek(f, 0, SEEK_END);
9456 if (r < 0) {
9457 unlock_fh_pos(f);
9458 return r;
9459 }
9460 }
9461 offset = f->pos;
9462 fpos = offset+size;
9463 unlock_fh_pos(f);
9464 }
9465
9466 // check quota
9467 uint64_t endoff = offset + size;
9468 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
9469 f->actor_perms)) {
9470 return -EDQUOT;
9471 }
9472
9473 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9474
9475 ldout(cct, 10) << "cur file size is " << in->size << dendl;
9476
9477 // time it.
9478 utime_t start = ceph_clock_now();
9479
9480 if (in->inline_version == 0) {
9481 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9482 if (r < 0)
9483 return r;
9484 ceph_assert(in->inline_version > 0);
9485 }
9486
9487 // copy into a fresh buffer (since our write may be resubmitted or completed asynchronously)
9488 bufferlist bl;
9489 if (buf) {
9490 if (size > 0)
9491 bl.append(buf, size);
9492 } else if (iov) {
9493 for (int i = 0; i < iovcnt; i++) {
9494 if (iov[i].iov_len > 0) {
9495 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
9496 }
9497 }
9498 }
9499
9500 utime_t lat;
9501 uint64_t totalwritten;
9502 int want, have;
9503 if (f->mode & CEPH_FILE_MODE_LAZY)
9504 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
9505 else
9506 want = CEPH_CAP_FILE_BUFFER;
9507 int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
9508 if (r < 0)
9509 return r;
9510
9511 /* clear the setuid/setgid bits, if any */
9512 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
9513 struct ceph_statx stx = { 0 };
9514
9515 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9516 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
9517 if (r < 0)
9518 return r;
9519 } else {
9520 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9521 }
9522
9523 if (f->flags & O_DIRECT)
9524 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
9525
9526 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
9527
9528 std::unique_ptr<C_SaferCond> onuninline = nullptr;
9529
9530 if (in->inline_version < CEPH_INLINE_NONE) {
9531 if (endoff > cct->_conf->client_max_inline_size ||
9532 endoff > CEPH_INLINE_MAX_SIZE ||
9533 !(have & CEPH_CAP_FILE_BUFFER)) {
9534 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9535 uninline_data(in, onuninline.get());
9536 } else {
9537 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9538
9539 uint32_t len = in->inline_data.length();
9540
9541 if (endoff < len)
9542 in->inline_data.copy(endoff, len - endoff, bl);
9543
9544 if (offset < len)
9545 in->inline_data.splice(offset, len - offset);
9546 else if (offset > len)
9547 in->inline_data.append_zero(offset - len);
9548
9549 in->inline_data.append(bl);
9550 in->inline_version++;
9551
9552 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9553
9554 goto success;
9555 }
9556 }
9557
9558 if (cct->_conf->client_oc &&
9559 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
9560 // do buffered write
9561 if (!in->oset.dirty_or_tx)
9562 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
9563
9564 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9565
9566 // async, caching, non-blocking.
9567 r = objectcacher->file_write(&in->oset, &in->layout,
9568 in->snaprealm->get_snap_context(),
9569 offset, size, bl, ceph::real_clock::now(),
9570 0);
9571 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9572
9573 if (r < 0)
9574 goto done;
9575
9576 // flush cached write if O_SYNC is set on file fh
9577 // O_DSYNC == O_SYNC on linux < 2.6.33
9578 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9579 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
9580 _flush_range(in, offset, size);
9581 }
9582 } else {
9583 if (f->flags & O_DIRECT)
9584 _flush_range(in, offset, size);
9585
9586 // simple, non-atomic sync write
9587 C_SaferCond onfinish("Client::_write flock");
9588 unsafe_sync_write++;
9589 get_cap_ref(in, CEPH_CAP_FILE_BUFFER); // released by onsafe callback
9590
9591 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
9592 offset, size, bl, ceph::real_clock::now(), 0,
9593 in->truncate_size, in->truncate_seq,
9594 &onfinish);
9595 client_lock.Unlock();
9596 onfinish.wait();
9597 client_lock.Lock();
9598 _sync_write_commit(in);
9599 }
9600
9601 // if we get here, write was successful, update client metadata
9602 success:
9603 // time
9604 lat = ceph_clock_now();
9605 lat -= start;
9606 logger->tinc(l_c_wrlat, lat);
9607
9608 if (fpos) {
9609 lock_fh_pos(f);
9610 f->pos = fpos;
9611 unlock_fh_pos(f);
9612 }
9613 totalwritten = size;
9614 r = (int64_t)totalwritten;
9615
9616 // extend file?
9617 if (totalwritten + offset > in->size) {
9618 in->size = totalwritten + offset;
9619 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9620
9621 if (is_quota_bytes_approaching(in, f->actor_perms)) {
9622 check_caps(in, CHECK_CAPS_NODELAY);
9623 } else if (is_max_size_approaching(in)) {
9624 check_caps(in, 0);
9625 }
9626
9627 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
9628 } else {
9629 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
9630 }
9631
9632 // mtime
9633 in->mtime = in->ctime = ceph_clock_now();
9634 in->change_attr++;
9635 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9636
9637 done:
9638
9639 if (nullptr != onuninline) {
9640 client_lock.Unlock();
9641 int uninline_ret = onuninline->wait();
9642 client_lock.Lock();
9643
9644 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
9645 in->inline_data.clear();
9646 in->inline_version = CEPH_INLINE_NONE;
9647 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9648 check_caps(in, 0);
9649 } else
9650 r = uninline_ret;
9651 }
9652
9653 put_cap_ref(in, CEPH_CAP_FILE_WR);
9654 return r;
9655 }
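// Illustrative sketch (not part of the original source): the open flags
// steer the write path above -- O_APPEND repositions to EOF under the fh
// pos lock, O_SYNC/O_DSYNC flush the buffered range before returning, and
// O_DIRECT drops the caching caps so the write goes synchronously through
// the Filer. Through the public wrappers (ceph_open()/ceph_write()
// assumed here) that looks like:
//
//   int fd = ceph_open(cmount, "/log", O_WRONLY | O_APPEND | O_SYNC, 0644);
//   if (fd >= 0) {
//     ceph_write(cmount, fd, msg, msglen, -1);  // offset < 0: use fh pos
//     ceph_close(cmount, fd);
//   }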
9656
9657 int Client::_flush(Fh *f)
9658 {
9659 Inode *in = f->inode.get();
9660 int err = f->take_async_err();
9661 if (err != 0) {
9662 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9663 << cpp_strerror(err) << dendl;
9664 } else {
9665 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9666 }
9667
9668 return err;
9669 }
9670
9671 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9672 {
9673 struct ceph_statx stx;
9674 stx.stx_size = length;
9675 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9676 }
9677
9678 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9679 {
9680 std::lock_guard lock(client_lock);
9681 tout(cct) << __func__ << std::endl;
9682 tout(cct) << fd << std::endl;
9683 tout(cct) << length << std::endl;
9684
9685 if (unmounting)
9686 return -ENOTCONN;
9687
9688 Fh *f = get_filehandle(fd);
9689 if (!f)
9690 return -EBADF;
9691 #if defined(__linux__) && defined(O_PATH)
9692 if (f->flags & O_PATH)
9693 return -EBADF;
9694 #endif
9695 struct stat attr;
9696 attr.st_size = length;
9697 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9698 }
9699
9700 int Client::fsync(int fd, bool syncdataonly)
9701 {
9702 std::lock_guard lock(client_lock);
9703 tout(cct) << "fsync" << std::endl;
9704 tout(cct) << fd << std::endl;
9705 tout(cct) << syncdataonly << std::endl;
9706
9707 if (unmounting)
9708 return -ENOTCONN;
9709
9710 Fh *f = get_filehandle(fd);
9711 if (!f)
9712 return -EBADF;
9713 #if defined(__linux__) && defined(O_PATH)
9714 if (f->flags & O_PATH)
9715 return -EBADF;
9716 #endif
9717 int r = _fsync(f, syncdataonly);
9718 if (r == 0) {
9719 // The IOs in this fsync were okay, but maybe something happened
9720 // in the background that we should be reporting?
9721 r = f->take_async_err();
9722 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9723 << ") = 0, async_err = " << r << dendl;
9724 } else {
9725 // Assume that an error we encountered during fsync, even reported
9726 // synchronously, would also have applied the error to the Fh, and we
9727 // should clear it here to avoid returning the same error again on next
9728 // call.
9729 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9730 << r << dendl;
9731 f->take_async_err();
9732 }
9733 return r;
9734 }
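// Illustrative sketch (not part of the original source): buffered writes
// can fail long after write() returned, and that asynchronous error is
// only surfaced here via take_async_err(). Durability-sensitive callers
// should therefore check fsync()'s return rather than trusting earlier
// successful writes:
//
//   if (cl->write(fd, buf, len, off) >= 0 &&
//       cl->fsync(fd, false /* data + metadata */) < 0) {
//     // writeback failed; the data may not be safe on disk
//   }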
9735
9736 int Client::_fsync(Inode *in, bool syncdataonly)
9737 {
9738 int r = 0;
9739 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
9740 ceph_tid_t flush_tid = 0;
9741 InodeRef tmp_ref;
9742 utime_t lat;
9743 utime_t start = ceph_clock_now();
9744
9745 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
9746
9747 if (cct->_conf->client_oc) {
9748 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
9749 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
9750 _flush(in, object_cacher_completion.get());
9751 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
9752 }
9753
9754 if (!syncdataonly && in->dirty_caps) {
9755 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
9756 if (in->flushing_caps)
9757 flush_tid = last_flush_tid;
9758 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
9759
9760 if (!syncdataonly && !in->unsafe_ops.empty()) {
9761 flush_mdlog_sync();
9762
9763 MetaRequest *req = in->unsafe_ops.back();
9764 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
9765
9766 req->get();
9767 wait_on_list(req->waitfor_safe);
9768 put_request(req);
9769 }
9770
9771 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
9772 client_lock.Unlock();
9773 ldout(cct, 15) << "waiting on data to flush" << dendl;
9774 r = object_cacher_completion->wait();
9775 client_lock.Lock();
9776 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
9777 } else {
9778 // FIXME: this can starve
9779 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
9780 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
9781 << " uncommitted, waiting" << dendl;
9782 wait_on_list(in->waitfor_commit);
9783 }
9784 }
9785
9786 if (!r) {
9787 if (flush_tid > 0)
9788 wait_sync_caps(in, flush_tid);
9789
9790 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
9791 } else {
9792 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
9793 << cpp_strerror(-r) << dendl;
9794 }
9795
9796 lat = ceph_clock_now();
9797 lat -= start;
9798 logger->tinc(l_c_fsync, lat);
9799
9800 return r;
9801 }
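// Illustrative sketch (not part of the original source): syncdataonly
// gives fdatasync(2)-like behavior -- dirty caps and unsafe MDS requests
// are only waited on for a full fsync:
//
//   cl->fsync(fd, true);    // data only, like fdatasync()
//   cl->fsync(fd, false);   // data + metadata, like fsync()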
9802
9803 int Client::_fsync(Fh *f, bool syncdataonly)
9804 {
9805 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9806 return _fsync(f->inode.get(), syncdataonly);
9807 }
9808
9809 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9810 {
9811 std::lock_guard lock(client_lock);
9812 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9813 tout(cct) << fd << std::endl;
9814
9815 if (unmounting)
9816 return -ENOTCONN;
9817
9818 Fh *f = get_filehandle(fd);
9819 if (!f)
9820 return -EBADF;
9821 int r = _getattr(f->inode, mask, perms);
9822 if (r < 0)
9823 return r;
9824 fill_stat(f->inode, stbuf, NULL);
9825 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9826 return r;
9827 }
9828
9829 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9830 unsigned int want, unsigned int flags)
9831 {
9832 std::lock_guard lock(client_lock);
9833 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9834 tout(cct) << fd << std::endl;
9835
9836 if (unmounting)
9837 return -ENOTCONN;
9838
9839 Fh *f = get_filehandle(fd);
9840 if (!f)
9841 return -EBADF;
9842
9843 unsigned mask = statx_to_mask(flags, want);
9844
9845 int r = 0;
9846 if (mask && !f->inode->caps_issued_mask(mask, true)) {
9847 r = _getattr(f->inode, mask, perms);
9848 if (r < 0) {
9849 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9850 return r;
9851 }
9852 }
9853
9854 fill_statx(f->inode, mask, stx);
9855 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9856 return r;
9857 }
9858
9859 // not written yet, but I want to link!
9860
9861 int Client::chdir(const char *relpath, std::string &new_cwd,
9862 const UserPerm& perms)
9863 {
9864 std::lock_guard lock(client_lock);
9865 tout(cct) << "chdir" << std::endl;
9866 tout(cct) << relpath << std::endl;
9867
9868 if (unmounting)
9869 return -ENOTCONN;
9870
9871 filepath path(relpath);
9872 InodeRef in;
9873 int r = path_walk(path, &in, perms);
9874 if (r < 0)
9875 return r;
9876 if (cwd != in)
9877 cwd.swap(in);
9878 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9879
9880 _getcwd(new_cwd, perms);
9881 return 0;
9882 }
9883
9884 void Client::_getcwd(string& dir, const UserPerm& perms)
9885 {
9886 filepath path;
9887 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
9888
9889 Inode *in = cwd.get();
9890 while (in != root) {
9891 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
9892
9893 // The cwd or an ancestor is unlinked
9894 if (in->dentries.empty()) {
9895 return;
9896 }
9897
9898 Dentry *dn = in->get_first_parent();
9899
9900
9901 if (!dn) {
9902 // look it up
9903 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
9904 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9905 filepath path(in->ino);
9906 req->set_filepath(path);
9907 req->set_inode(in);
9908 int res = make_request(req, perms);
9909 if (res < 0)
9910 break;
9911
9912 // start over
9913 path = filepath();
9914 in = cwd.get();
9915 continue;
9916 }
9917 path.push_front_dentry(dn->name);
9918 in = dn->dir->parent_inode;
9919 }
9920 dir = "/";
9921 dir += path.get_path();
9922 }
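// Illustrative sketch (not part of the original source): the walk above
// rebuilds the path one dentry at a time, issuing an MDS LOOKUPNAME when
// a parent link is not cached. A caller only ever sees the final string:
//
//   std::string cwd;
//   client->getcwd(cwd, perms);   // e.g. "/home/alice"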
9923
9924 void Client::getcwd(string& dir, const UserPerm& perms)
9925 {
9926 std::lock_guard l(client_lock);
9927 if (!unmounting)
9928 _getcwd(dir, perms);
9929 }
9930
9931 int Client::statfs(const char *path, struct statvfs *stbuf,
9932 const UserPerm& perms)
9933 {
9934 std::lock_guard l(client_lock);
9935 tout(cct) << __func__ << std::endl;
9936 unsigned long int total_files_on_fs;
9937
9938 if (unmounting)
9939 return -ENOTCONN;
9940
9941 ceph_statfs stats;
9942 C_SaferCond cond;
9943
9944 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9945 if (data_pools.size() == 1) {
9946 objecter->get_fs_stats(stats, data_pools[0], &cond);
9947 } else {
9948 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9949 }
9950
9951 client_lock.Unlock();
9952 int rval = cond.wait();
9953 assert(root);
9954 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
9955 client_lock.Lock();
9956
9957 if (rval < 0) {
9958 ldout(cct, 1) << "underlying call to statfs returned error: "
9959 << cpp_strerror(rval)
9960 << dendl;
9961 return rval;
9962 }
9963
9964 memset(stbuf, 0, sizeof(*stbuf));
9965
9966 /*
9967 * we're going to set a block size of 4MB so we can represent larger
9968 * FSes without overflowing. Additionally, convert the space
9969 * measurements from KB into units of those 4MB blocks. We use 4MB
9970 * only because it is big enough, and because it
9971 * actually *is* the (ceph) default block size.
9972 */
9973 const int CEPH_BLOCK_SHIFT = 22;
9974 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9975 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
9976 stbuf->f_files = total_files_on_fs;
9977 stbuf->f_ffree = 0;
9978 stbuf->f_favail = -1;
9979 stbuf->f_fsid = -1; // ??
9980 stbuf->f_flag = 0; // ??
9981 stbuf->f_namemax = NAME_MAX;
9982
9983 // Usually quota_root will == root_ancestor, but if the mount root has no
9984 // quota but we can see a parent of it that does have a quota, we'll
9985 // respect that one instead.
9986 ceph_assert(root != nullptr);
9987 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
9988
9989 // get_quota_root should always give us something
9990 // because client quotas are always enabled
9991 ceph_assert(quota_root != nullptr);
9992
9993 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
9994
9995 // Skip the getattr if any sessions are stale, as we don't want to
9996 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9997 // is unhealthy.
9998 if (!_any_stale_sessions()) {
9999 int r = _getattr(quota_root, 0, perms, true);
10000 if (r != 0) {
10001 // Ignore return value: error getting latest inode metadata is not a good
10002 // reason to break "df".
10003 lderr(cct) << "Error in getattr on quota root 0x"
10004 << std::hex << quota_root->ino << std::dec
10005 << " statfs result may be outdated" << dendl;
10006 }
10007 }
10008
10009 // Special case: if there is a size quota set on the Inode acting
10010 // as the root for this client mount, then report the quota status
10011 // as the filesystem statistics.
10012 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10013 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10014 // It is possible for a quota to be exceeded: arithmetic here must
10015 // handle case where used > total.
10016 const fsblkcnt_t free = total > used ? total - used : 0;
10017
10018 stbuf->f_blocks = total;
10019 stbuf->f_bfree = free;
10020 stbuf->f_bavail = free;
10021 } else {
10022 // General case: report the cluster statistics returned from RADOS. Because
10023 // multiple pools may be used within one filesystem namespace via
10024 // layouts, this is the most correct thing we can do.
10025 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10026 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10027 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10028 }
10029
10030 return rval;
10031 }
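// Illustrative sketch (not part of the original source): the shifts above
// turn the KB figures from RADOS into 4MB f_frsize blocks, i.e.
// kb >> (CEPH_BLOCK_SHIFT - 10) == kb / 4096. Worked example:
//
//   const int CEPH_BLOCK_SHIFT = 22;             // 4MB blocks
//   uint64_t kb = 1ull << 30;                    // 1 TiB expressed in KB
//   uint64_t blocks = kb >> (CEPH_BLOCK_SHIFT - 10);
//   // blocks == 1ull << 18 == 262144, and 262144 * 4MB == 1 TiB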
10032
10033 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10034 struct flock *fl, uint64_t owner, bool removing)
10035 {
10036 ldout(cct, 10) << __func__ << " ino " << in->ino
10037 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10038 << " type " << fl->l_type << " owner " << owner
10039 << " " << fl->l_start << "~" << fl->l_len << dendl;
10040
10041 int lock_cmd;
10042 if (F_RDLCK == fl->l_type)
10043 lock_cmd = CEPH_LOCK_SHARED;
10044 else if (F_WRLCK == fl->l_type)
10045 lock_cmd = CEPH_LOCK_EXCL;
10046 else if (F_UNLCK == fl->l_type)
10047 lock_cmd = CEPH_LOCK_UNLOCK;
10048 else
10049 return -EIO;
10050
10051 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10052 sleep = 0;
10053
10054 /*
10055 * Set the most significant bit, so that MDS knows the 'owner'
10056 * is sufficient to identify the owner of the lock. (old code uses
10057 * both 'owner' and 'pid')
10058 */
10059 owner |= (1ULL << 63);
10060
10061 MetaRequest *req = new MetaRequest(op);
10062 filepath path;
10063 in->make_nosnap_relative_path(path);
10064 req->set_filepath(path);
10065 req->set_inode(in);
10066
10067 req->head.args.filelock_change.rule = lock_type;
10068 req->head.args.filelock_change.type = lock_cmd;
10069 req->head.args.filelock_change.owner = owner;
10070 req->head.args.filelock_change.pid = fl->l_pid;
10071 req->head.args.filelock_change.start = fl->l_start;
10072 req->head.args.filelock_change.length = fl->l_len;
10073 req->head.args.filelock_change.wait = sleep;
10074
10075 int ret;
10076 bufferlist bl;
10077
10078 if (sleep && switch_interrupt_cb) {
10079 // enable interrupt
10080 switch_interrupt_cb(callback_handle, req->get());
10081 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10082 // disable interrupt
10083 switch_interrupt_cb(callback_handle, NULL);
10084 if (ret == 0 && req->aborted()) {
10085 // effect of this lock request has been revoked by the 'lock intr' request
10086 ret = req->get_abort_code();
10087 }
10088 put_request(req);
10089 } else {
10090 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10091 }
10092
10093 if (ret == 0) {
10094 if (op == CEPH_MDS_OP_GETFILELOCK) {
10095 ceph_filelock filelock;
10096 auto p = bl.cbegin();
10097 decode(filelock, p);
10098
10099 if (CEPH_LOCK_SHARED == filelock.type)
10100 fl->l_type = F_RDLCK;
10101 else if (CEPH_LOCK_EXCL == filelock.type)
10102 fl->l_type = F_WRLCK;
10103 else
10104 fl->l_type = F_UNLCK;
10105
10106 fl->l_whence = SEEK_SET;
10107 fl->l_start = filelock.start;
10108 fl->l_len = filelock.length;
10109 fl->l_pid = filelock.pid;
10110 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10111 ceph_lock_state_t *lock_state;
10112 if (lock_type == CEPH_LOCK_FCNTL) {
10113 if (!in->fcntl_locks)
10114 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10115 lock_state = in->fcntl_locks.get();
10116 } else if (lock_type == CEPH_LOCK_FLOCK) {
10117 if (!in->flock_locks)
10118 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10119 lock_state = in->flock_locks.get();
10120 } else {
10121 ceph_abort();
10122 return -EINVAL;
10123 }
10124 _update_lock_state(fl, owner, lock_state);
10125
10126 if (!removing) {
10127 if (lock_type == CEPH_LOCK_FCNTL) {
10128 if (!fh->fcntl_locks)
10129 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10130 lock_state = fh->fcntl_locks.get();
10131 } else {
10132 if (!fh->flock_locks)
10133 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10134 lock_state = fh->flock_locks.get();
10135 }
10136 _update_lock_state(fl, owner, lock_state);
10137 }
10138 } else
10139 ceph_abort();
10140 }
10141 return ret;
10142 }
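// Illustrative sketch (not part of the original source): the owner token
// sent to the MDS is the caller's cookie with the most significant bit
// forced on, so two requests sharing the low 63 bits are the same lock
// holder regardless of pid:
//
//   uint64_t owner_token(uint64_t owner) {
//     return owner | (1ULL << 63);  // 'owner' alone identifies the holder
//   }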
10143
10144 int Client::_interrupt_filelock(MetaRequest *req)
10145 {
10146 // Set abort code, but do not kick. The abort code prevents the request
10147 // from being re-sent.
10148 req->abort(-EINTR);
10149 if (req->mds < 0)
10150 return 0; // haven't sent the request
10151
10152 Inode *in = req->inode();
10153
10154 int lock_type;
10155 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10156 lock_type = CEPH_LOCK_FLOCK_INTR;
10157 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10158 lock_type = CEPH_LOCK_FCNTL_INTR;
10159 else {
10160 ceph_abort();
10161 return -EINVAL;
10162 }
10163
10164 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10165 filepath path;
10166 in->make_nosnap_relative_path(path);
10167 intr_req->set_filepath(path);
10168 intr_req->set_inode(in);
10169 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10170 intr_req->head.args.filelock_change.rule = lock_type;
10171 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10172
10173 UserPerm perms(req->get_uid(), req->get_gid());
10174 return make_request(intr_req, perms, NULL, NULL, -1);
10175 }
10176
10177 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10178 {
10179 if (!in->fcntl_locks && !in->flock_locks)
10180 return;
10181
10182 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10183 encode(nr_fcntl_locks, bl);
10184 if (nr_fcntl_locks) {
10185 auto &lock_state = in->fcntl_locks;
10186 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10187 p != lock_state->held_locks.end();
10188 ++p)
10189 encode(p->second, bl);
10190 }
10191
10192 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10193 encode(nr_flock_locks, bl);
10194 if (nr_flock_locks) {
10195 auto &lock_state = in->flock_locks;
10196 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10197 p != lock_state->held_locks.end();
10198 ++p)
10199 encode(p->second, bl);
10200 }
10201
10202 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10203 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10204 }
10205
10206 void Client::_release_filelocks(Fh *fh)
10207 {
10208 if (!fh->fcntl_locks && !fh->flock_locks)
10209 return;
10210
10211 Inode *in = fh->inode.get();
10212 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
10213
10214 list<pair<int, ceph_filelock> > to_release;
10215
10216 if (fh->fcntl_locks) {
10217 auto &lock_state = fh->fcntl_locks;
10218 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10219 p != lock_state->held_locks.end();
10220 ++p)
10221 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
10222 lock_state.reset();
10223 }
10224 if (fh->flock_locks) {
10225 auto &lock_state = fh->flock_locks;
10226 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10227 p != lock_state->held_locks.end();
10228 ++p)
10229 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
10230 lock_state.reset();
10231 }
10232
10233 if (to_release.empty())
10234 return;
10235
10236 // mds has already released filelocks if session was closed.
10237 if (in->caps.empty())
10238 return;
10239
10240 struct flock fl;
10241 memset(&fl, 0, sizeof(fl));
10242 fl.l_whence = SEEK_SET;
10243 fl.l_type = F_UNLCK;
10244
10245 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10246 p != to_release.end();
10247 ++p) {
10248 fl.l_start = p->second.start;
10249 fl.l_len = p->second.length;
10250 fl.l_pid = p->second.pid;
10251 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10252 p->second.owner, true);
10253 }
10254 }
10255
10256 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10257 ceph_lock_state_t *lock_state)
10258 {
10259 int lock_cmd;
10260 if (F_RDLCK == fl->l_type)
10261 lock_cmd = CEPH_LOCK_SHARED;
10262 else if (F_WRLCK == fl->l_type)
10263 lock_cmd = CEPH_LOCK_EXCL;
10264 else
10265 lock_cmd = CEPH_LOCK_UNLOCK;
10266
10267 ceph_filelock filelock;
10268 filelock.start = fl->l_start;
10269 filelock.length = fl->l_len;
10270 filelock.client = 0;
10271 // see comment in _do_filelock()
10272 filelock.owner = owner | (1ULL << 63);
10273 filelock.pid = fl->l_pid;
10274 filelock.type = lock_cmd;
10275
10276 if (filelock.type == CEPH_LOCK_UNLOCK) {
10277 list<ceph_filelock> activated_locks;
10278 lock_state->remove_lock(filelock, activated_locks);
10279 } else {
10280 bool r = lock_state->add_lock(filelock, false, false, NULL);
10281 ceph_assert(r);
10282 }
10283 }
10284
10285 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10286 {
10287 Inode *in = fh->inode.get();
10288 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10289 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10290 return ret;
10291 }
10292
10293 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10294 {
10295 Inode *in = fh->inode.get();
10296 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10297 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10298 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10299 return ret;
10300 }
10301
10302 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10303 {
10304 Inode *in = fh->inode.get();
10305 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10306
10307 int sleep = !(cmd & LOCK_NB);
10308 cmd &= ~LOCK_NB;
10309
10310 int type;
10311 switch (cmd) {
10312 case LOCK_SH:
10313 type = F_RDLCK;
10314 break;
10315 case LOCK_EX:
10316 type = F_WRLCK;
10317 break;
10318 case LOCK_UN:
10319 type = F_UNLCK;
10320 break;
10321 default:
10322 return -EINVAL;
10323 }
10324
10325 struct flock fl;
10326 memset(&fl, 0, sizeof(fl));
10327 fl.l_type = type;
10328 fl.l_whence = SEEK_SET;
10329
10330 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10331 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10332 return ret;
10333 }
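// Illustrative sketch (not part of the original source): via the public
// wrapper (ceph_flock() assumed here), the LOCK_NB decoding above means a
// non-blocking request that conflicts typically fails with -EAGAIN rather
// than sleeping:
//
//   uint64_t me = (uint64_t)getpid();  // any stable per-holder cookie
//   int r = ceph_flock(cmount, fd, LOCK_EX | LOCK_NB, me);
//   if (r == -EAGAIN) {
//     // someone else holds a conflicting lock
//   }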
10334
10335 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10336 {
10337 /* Since the only thing this does is wrap a call to statfs, and
10338 statfs takes a lock, it doesn't seem we have a need to split it
10339 out. */
10340 return statfs(0, stbuf, perms);
10341 }
10342
10343 void Client::ll_register_callbacks(struct client_callback_args *args)
10344 {
10345 if (!args)
10346 return;
10347 std::lock_guard l(client_lock);
10348 ldout(cct, 10) << __func__ << " cb " << args->handle
10349 << " invalidate_ino_cb " << args->ino_cb
10350 << " invalidate_dentry_cb " << args->dentry_cb
10351 << " switch_interrupt_cb " << args->switch_intr_cb
10352 << " remount_cb " << args->remount_cb
10353 << dendl;
10354 callback_handle = args->handle;
10355 if (args->ino_cb) {
10356 ino_invalidate_cb = args->ino_cb;
10357 async_ino_invalidator.start();
10358 }
10359 if (args->dentry_cb) {
10360 dentry_invalidate_cb = args->dentry_cb;
10361 async_dentry_invalidator.start();
10362 }
10363 if (args->switch_intr_cb) {
10364 switch_interrupt_cb = args->switch_intr_cb;
10365 interrupt_finisher.start();
10366 }
10367 if (args->remount_cb) {
10368 remount_cb = args->remount_cb;
10369 remount_finisher.start();
10370 }
10371 umask_cb = args->umask_cb;
10372 }
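// Illustrative sketch (not part of the original source): a low-level
// consumer such as a FUSE bridge registers its callbacks once after
// mount; members left unset simply keep the corresponding finisher
// thread stopped:
//
//   struct client_callback_args args = {};
//   args.handle = my_state;             // opaque pointer handed back to cbs
//   args.ino_cb = my_invalidate_inode;  // hypothetical callback functions
//   args.dentry_cb = my_invalidate_dentry;
//   client->ll_register_callbacks(&args);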
10373
10374 int Client::test_dentry_handling(bool can_invalidate)
10375 {
10376 int r = 0;
10377
10378 can_invalidate_dentries = can_invalidate;
10379
10380 if (can_invalidate_dentries) {
10381 ceph_assert(dentry_invalidate_cb);
10382 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10383 r = 0;
10384 } else {
10385 ceph_assert(remount_cb);
10386 ldout(cct, 1) << "using remount_cb" << dendl;
10387 r = _do_remount(false);
10388 }
10389
10390 return r;
10391 }
10392
10393 int Client::_sync_fs()
10394 {
10395 ldout(cct, 10) << __func__ << dendl;
10396
10397 // flush file data
10398 std::unique_ptr<C_SaferCond> cond = nullptr;
10399 if (cct->_conf->client_oc) {
10400 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10401 objectcacher->flush_all(cond.get());
10402 }
10403
10404 // flush caps
10405 flush_caps_sync();
10406 ceph_tid_t flush_tid = last_flush_tid;
10407
10408 // wait for unsafe mds requests
10409 wait_unsafe_requests();
10410
10411 wait_sync_caps(flush_tid);
10412
10413 if (nullptr != cond) {
10414 client_lock.Unlock();
10415 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10416 cond->wait();
10417 ldout(cct, 15) << __func__ << " flush finished" << dendl;
10418 client_lock.Lock();
10419 }
10420
10421 return 0;
10422 }
10423
10424 int Client::sync_fs()
10425 {
10426 std::lock_guard l(client_lock);
10427
10428 if (unmounting)
10429 return -ENOTCONN;
10430
10431 return _sync_fs();
10432 }
10433
10434 int64_t Client::drop_caches()
10435 {
10436 std::lock_guard l(client_lock);
10437 return objectcacher->release_all();
10438 }
10439
10440 int Client::_lazyio(Fh *fh, int enable)
10441 {
10442 Inode *in = fh->inode.get();
10443 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10444
10445 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10446 return 0;
10447
10448 int orig_mode = fh->mode;
10449 if (enable) {
10450 fh->mode |= CEPH_FILE_MODE_LAZY;
10451 in->get_open_ref(fh->mode);
10452 in->put_open_ref(orig_mode);
10453 check_caps(in, CHECK_CAPS_NODELAY);
10454 } else {
10455 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10456 in->get_open_ref(fh->mode);
10457 in->put_open_ref(orig_mode);
10458 check_caps(in, 0);
10459 }
10460
10461 return 0;
10462 }
10463
10464 int Client::lazyio(int fd, int enable)
10465 {
10466 std::lock_guard l(client_lock);
10467 Fh *f = get_filehandle(fd);
10468 if (!f)
10469 return -EBADF;
10470
10471 return _lazyio(f, enable);
10472 }
10473
10474 int Client::ll_lazyio(Fh *fh, int enable)
10475 {
10476 std::lock_guard lock(client_lock);
10477 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10478 tout(cct) << __func__ << std::endl;
10479
10480 return _lazyio(fh, enable);
10481 }
10482
10483 int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10484 {
10485 std::lock_guard l(client_lock);
10486 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10487 << ", " << offset << ", " << count << ")" << dendl;
10488
10489 Fh *f = get_filehandle(fd);
10490 if (!f)
10491 return -EBADF;
10492
10493 // for now
10494 _fsync(f, true);
10495
10496 return 0;
10497 }
10498
10499 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10500 {
10501 std::lock_guard l(client_lock);
10502 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10503 << ", " << offset << ", " << count << ")" << dendl;
10504
10505 Fh *f = get_filehandle(fd);
10506 if (!f)
10507 return -EBADF;
10508 Inode *in = f->inode.get();
10509
10510 _fsync(f, true);
10511 if (_release(in))
10512 check_caps(in, 0);
10513 return 0;
10514 }
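// Illustrative sketch (not part of the original source): with lazy I/O
// the client trades coherency for performance and must publish/refresh
// data itself. Assuming public wrappers that map onto the methods above
// (ceph_lazyio() exists; the propagate/synchronize names are guesses):
//
//   ceph_lazyio(cmount, fd, 1);                   // enable lazy mode
//   /* ... local writes ... */
//   ceph_lazyio_propagate(cmount, fd, 0, len);    // push our dirty data
//   ceph_lazyio_synchronize(cmount, fd, 0, len);  // drop stale cached data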
10515
10516
10517 // =============================
10518 // snaps
10519
10520 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10521 {
10522 std::lock_guard l(client_lock);
10523
10524 if (unmounting)
10525 return -ENOTCONN;
10526
10527 filepath path(relpath);
10528 InodeRef in;
10529 int r = path_walk(path, &in, perm);
10530 if (r < 0)
10531 return r;
10532 if (cct->_conf->client_permissions) {
10533 r = may_create(in.get(), perm);
10534 if (r < 0)
10535 return r;
10536 }
10537 Inode *snapdir = open_snapdir(in.get());
10538 return _mkdir(snapdir, name, 0, perm);
10539 }
10540
10541 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10542 {
10543 std::lock_guard l(client_lock);
10544
10545 if (unmounting)
10546 return -ENOTCONN;
10547
10548 filepath path(relpath);
10549 InodeRef in;
10550 int r = path_walk(path, &in, perms);
10551 if (r < 0)
10552 return r;
10553 if (cct->_conf->client_permissions) {
10554 r = may_delete(in.get(), NULL, perms);
10555 if (r < 0)
10556 return r;
10557 }
10558 Inode *snapdir = open_snapdir(in.get());
10559 return _rmdir(snapdir, name, perms);
10560 }
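// Illustrative sketch (not part of the original source): as the two
// methods above show, a CephFS snapshot is just a directory under the
// virtual ".snap" dir, so creating and removing one reduces to mkdir and
// rmdir on the snapdir inode. Equivalently, through the path interface:
//
//   client->mkdir("/data/.snap/before-upgrade", 0755, perms);  // mksnap
//   client->rmdir("/data/.snap/before-upgrade", perms);        // rmsnap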
10561
10562 // =============================
10563 // expose caps
10564
10565 int Client::get_caps_issued(int fd) {
10566
10567 std::lock_guard lock(client_lock);
10568
10569 if (unmounting)
10570 return -ENOTCONN;
10571
10572 Fh *f = get_filehandle(fd);
10573 if (!f)
10574 return -EBADF;
10575
10576 return f->inode->caps_issued();
10577 }
10578
10579 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10580 {
10581 std::lock_guard lock(client_lock);
10582
10583 if (unmounting)
10584 return -ENOTCONN;
10585
10586 filepath p(path);
10587 InodeRef in;
10588 int r = path_walk(p, &in, perms, true);
10589 if (r < 0)
10590 return r;
10591 return in->caps_issued();
10592 }
10593
10594 // =========================================
10595 // low level
10596
10597 Inode *Client::open_snapdir(Inode *diri)
10598 {
10599 Inode *in;
10600 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10601 if (!inode_map.count(vino)) {
10602 in = new Inode(this, vino, &diri->layout);
10603
10604 in->ino = diri->ino;
10605 in->snapid = CEPH_SNAPDIR;
10606 in->mode = diri->mode;
10607 in->uid = diri->uid;
10608 in->gid = diri->gid;
10609 in->mtime = diri->mtime;
10610 in->ctime = diri->ctime;
10611 in->btime = diri->btime;
10612 in->size = diri->size;
10613 in->change_attr = diri->change_attr;
10614
10615 in->dirfragtree.clear();
10616 in->snapdir_parent = diri;
10617 diri->flags |= I_SNAPDIR_OPEN;
10618 inode_map[vino] = in;
10619 if (use_faked_inos())
10620 _assign_faked_ino(in);
10621 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10622 } else {
10623 in = inode_map[vino];
10624 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10625 }
10626 return in;
10627 }
10628
10629 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10630 Inode **out, const UserPerm& perms)
10631 {
10632 std::lock_guard lock(client_lock);
10633 vinodeno_t vparent = _get_vino(parent);
10634 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10635 tout(cct) << __func__ << std::endl;
10636 tout(cct) << name << std::endl;
10637
10638 if (unmounting)
10639 return -ENOTCONN;
10640
10641 int r = 0;
10642 auto fuse_default_permissions = cct->_conf.get_val<bool>(
10643 "fuse_default_permissions");
10644 if (!fuse_default_permissions) {
10645 if (strcmp(name, ".") && strcmp(name, "..")) {
10646 r = may_lookup(parent, perms);
10647 if (r < 0)
10648 return r;
10649 }
10650 }
10651
10652 string dname(name);
10653 InodeRef in;
10654
10655 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10656 if (r < 0) {
10657 attr->st_ino = 0;
10658 goto out;
10659 }
10660
10661 ceph_assert(in);
10662 fill_stat(in, attr);
10663 _ll_get(in.get());
10664
10665 out:
10666 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10667 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10668 tout(cct) << attr->st_ino << std::endl;
10669 *out = in.get();
10670 return r;
10671 }
10672
10673 int Client::ll_lookup_inode(
10674 struct inodeno_t ino,
10675 const UserPerm& perms,
10676 Inode **inode)
10677 {
10678 std::lock_guard lock(client_lock);
10679 ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
10680
10681 // Step 1: get the inode and populate *inode
10682 int r = _lookup_ino(ino, perms, inode);
10683 if (r) {
10684 return r;
10685 }
10686 ceph_assert(inode != NULL);
10687 ceph_assert(*inode != NULL);
10688
10689 // Step 2: request the parent inode, so that we can look up the name
10690 Inode *parent;
10691 r = _lookup_parent(*inode, perms, &parent);
10692 if (r && r != -EINVAL) {
10693 // Unexpected error
10694 _ll_forget(*inode, 1);
10695 return r;
10696 } else if (r == -EINVAL) {
10697 // EINVAL indicates node without parents (root), drop out now
10698 // and don't try to look up the non-existent dentry.
10699 return 0;
10700 }
10701 // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
10702 // is already in cache
10703 ceph_assert(parent != NULL);
10704
10705 // Step 3: finally, get the name (dentry) of the requested inode
10706 r = _lookup_name(*inode, parent, perms);
10707 if (r) {
10708 // Unexpected error
10709 _ll_forget(parent, 1);
10710 _ll_forget(*inode, 1);
10711 return r;
10712 }
10713
10714 _ll_forget(parent, 1);
10715 return 0;
10716 }
10717
10718 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10719 struct ceph_statx *stx, unsigned want, unsigned flags,
10720 const UserPerm& perms)
10721 {
10722 std::lock_guard lock(client_lock);
10723 vinodeno_t vparent = _get_vino(parent);
10724 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10725 tout(cct) << "ll_lookupx" << std::endl;
10726 tout(cct) << name << std::endl;
10727
10728 if (unmounting)
10729 return -ENOTCONN;
10730
10731 int r = 0;
10732 auto fuse_default_permissions = cct->_conf.get_val<bool>(
10733 "fuse_default_permissions");
10734 if (!fuse_default_permissions) {
10735 r = may_lookup(parent, perms);
10736 if (r < 0)
10737 return r;
10738 }
10739
10740 string dname(name);
10741 InodeRef in;
10742
10743 unsigned mask = statx_to_mask(flags, want);
10744 r = _lookup(parent, dname, mask, &in, perms);
10745 if (r < 0) {
10746 stx->stx_ino = 0;
10747 stx->stx_mask = 0;
10748 } else {
10749 ceph_assert(in);
10750 fill_statx(in, mask, stx);
10751 _ll_get(in.get());
10752 }
10753
10754 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10755 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10756 tout(cct) << stx->stx_ino << std::endl;
10757 *out = in.get();
10758 return r;
10759 }
10760
10761 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10762 unsigned int want, unsigned int flags, const UserPerm& perms)
10763 {
10764 std::lock_guard lock(client_lock);
10765
10766 if (unmounting)
10767 return -ENOTCONN;
10768
10769 filepath fp(name, 0);
10770 InodeRef in;
10771 int rc;
10772 unsigned mask = statx_to_mask(flags, want);
10773
10774 ldout(cct, 3) << __func__ << " " << name << dendl;
10775 tout(cct) << __func__ << std::endl;
10776 tout(cct) << name << std::endl;
10777
10778 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10779 if (rc < 0) {
10780 /* zero out mask, just in case... */
10781 stx->stx_mask = 0;
10782 stx->stx_ino = 0;
10783 *out = NULL;
10784 return rc;
10785 } else {
10786 ceph_assert(in);
10787 fill_statx(in, mask, stx);
10788 _ll_get(in.get());
10789 *out = in.get();
10790 return 0;
10791 }
10792 }
10793
10794 void Client::_ll_get(Inode *in)
10795 {
10796 if (in->ll_ref == 0) {
10797 in->get();
10798 if (in->is_dir() && !in->dentries.empty()) {
10799 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
10800 in->get_first_parent()->get(); // pin dentry
10801 }
10802 if (in->snapid != CEPH_NOSNAP)
10803 ll_snap_ref[in->snapid]++;
10804 }
10805 in->ll_get();
10806 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
10807 }
10808
10809 int Client::_ll_put(Inode *in, int num)
10810 {
10811 in->ll_put(num);
10812 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
10813 if (in->ll_ref == 0) {
10814 if (in->is_dir() && !in->dentries.empty()) {
10815 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
10816 in->get_first_parent()->put(); // unpin dentry
10817 }
10818 if (in->snapid != CEPH_NOSNAP) {
10819 auto p = ll_snap_ref.find(in->snapid);
10820 ceph_assert(p != ll_snap_ref.end());
10821 ceph_assert(p->second > 0);
10822 if (--p->second == 0)
10823 ll_snap_ref.erase(p);
10824 }
10825 put_inode(in);
10826 return 0;
10827 } else {
10828 return in->ll_ref;
10829 }
10830 }
10831
10832 void Client::_ll_drop_pins()
10833 {
10834 ldout(cct, 10) << __func__ << dendl;
10835 std::set<InodeRef> to_be_put; // this set is destructed item by item when the function exits
10836 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10837 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10838 it != inode_map.end();
10839 it = next) {
10840 Inode *in = it->second;
10841 next = it;
10842 ++next;
10843 if (in->ll_ref) {
10844 to_be_put.insert(in);
10845 _ll_put(in, in->ll_ref);
10846 }
10847 }
10848 }
10849
10850 bool Client::_ll_forget(Inode *in, int count)
10851 {
10852 inodeno_t ino = in->ino;
10853
10854 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
10855 tout(cct) << __func__ << std::endl;
10856 tout(cct) << ino.val << std::endl;
10857 tout(cct) << count << std::endl;
10858
10859 // Ignore forget if we're no longer mounted
10860 if (unmounting)
10861 return true;
10862
10863 if (ino == 1) return true; // ignore forget on root.
10864
10865 bool last = false;
10866 if (in->ll_ref < count) {
10867 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
10868 << ", which only has ll_ref=" << in->ll_ref << dendl;
10869 _ll_put(in, in->ll_ref);
10870 last = true;
10871 } else {
10872 if (_ll_put(in, count) == 0)
10873 last = true;
10874 }
10875
10876 return last;
10877 }
10878
10879 bool Client::ll_forget(Inode *in, int count)
10880 {
10881 std::lock_guard lock(client_lock);
10882 return _ll_forget(in, count);
10883 }
10884
10885 bool Client::ll_put(Inode *in)
10886 {
10887 /* ll_forget already takes the lock */
10888 return ll_forget(in, 1);
10889 }
10890
10891 int Client::ll_get_snap_ref(snapid_t snap)
10892 {
10893 std::lock_guard lock(client_lock);
10894 auto p = ll_snap_ref.find(snap);
10895 if (p != ll_snap_ref.end())
10896 return p->second;
10897 return 0;
10898 }
10899
10900 snapid_t Client::ll_get_snapid(Inode *in)
10901 {
10902 std::lock_guard lock(client_lock);
10903 return in->snapid;
10904 }
10905
10906 Inode *Client::ll_get_inode(ino_t ino)
10907 {
10908 std::lock_guard lock(client_lock);
10909
10910 if (unmounting)
10911 return NULL;
10912
10913 vinodeno_t vino = _map_faked_ino(ino);
10914 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10915 if (p == inode_map.end())
10916 return NULL;
10917 Inode *in = p->second;
10918 _ll_get(in);
10919 return in;
10920 }
10921
10922 Inode *Client::ll_get_inode(vinodeno_t vino)
10923 {
10924 std::lock_guard lock(client_lock);
10925
10926 if (unmounting)
10927 return NULL;
10928
10929 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10930 if (p == inode_map.end())
10931 return NULL;
10932 Inode *in = p->second;
10933 _ll_get(in);
10934 return in;
10935 }
10936
10937 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10938 {
10939 vinodeno_t vino = _get_vino(in);
10940
10941 ldout(cct, 8) << __func__ << " " << vino << dendl;
10942 tout(cct) << __func__ << std::endl;
10943 tout(cct) << vino.ino.val << std::endl;
10944
10945 if (vino.snapid < CEPH_NOSNAP)
10946 return 0;
10947 else
10948 return _getattr(in, caps, perms);
10949 }
10950
10951 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10952 {
10953 std::lock_guard lock(client_lock);
10954
10955 if (unmounting)
10956 return -ENOTCONN;
10957
10958 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10959
10960 if (res == 0)
10961 fill_stat(in, attr);
10962 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
10963 return res;
10964 }
10965
10966 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10967 unsigned int flags, const UserPerm& perms)
10968 {
10969 std::lock_guard lock(client_lock);
10970
10971 if (unmounting)
10972 return -ENOTCONN;
10973
10974 int res = 0;
10975 unsigned mask = statx_to_mask(flags, want);
10976
10977 if (mask && !in->caps_issued_mask(mask, true))
10978 res = _ll_getattr(in, mask, perms);
10979
10980 if (res == 0)
10981 fill_statx(in, mask, stx);
10982 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
10983 return res;
10984 }
10985
10986 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10987 const UserPerm& perms, InodeRef *inp)
10988 {
10989 vinodeno_t vino = _get_vino(in);
10990
10991 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
10992 << dendl;
10993 tout(cct) << __func__ << std::endl;
10994 tout(cct) << vino.ino.val << std::endl;
10995 tout(cct) << stx->stx_mode << std::endl;
10996 tout(cct) << stx->stx_uid << std::endl;
10997 tout(cct) << stx->stx_gid << std::endl;
10998 tout(cct) << stx->stx_size << std::endl;
10999 tout(cct) << stx->stx_mtime << std::endl;
11000 tout(cct) << stx->stx_atime << std::endl;
11001 tout(cct) << stx->stx_btime << std::endl;
11002 tout(cct) << mask << std::endl;
11003
11004 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11005 "fuse_default_permissions");
11006 if (!fuse_default_permissions) {
11007 int res = may_setattr(in, stx, mask, perms);
11008 if (res < 0)
11009 return res;
11010 }
11011
11012 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11013
11014 return __setattrx(in, stx, mask, perms, inp);
11015 }
11016
11017 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11018 const UserPerm& perms)
11019 {
11020 std::lock_guard lock(client_lock);
11021
11022 if (unmounting)
11023 return -ENOTCONN;
11024
11025 InodeRef target(in);
11026 int res = _ll_setattrx(in, stx, mask, perms, &target);
11027 if (res == 0) {
11028 ceph_assert(in == target.get());
11029 fill_statx(in, in->caps_issued(), stx);
11030 }
11031
11032 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11033 return res;
11034 }
11035
11036 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11037 const UserPerm& perms)
11038 {
11039 struct ceph_statx stx;
11040 stat_to_statx(attr, &stx);
11041
11042 std::lock_guard lock(client_lock);
11043
11044 if (unmounting)
11045 return -ENOTCONN;
11046
11047 InodeRef target(in);
11048 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11049 if (res == 0) {
11050 ceph_assert(in == target.get());
11051 fill_stat(in, attr);
11052 }
11053
11054 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11055 return res;
11056 }
11057
11058
11059 // ----------
11060 // xattrs
11061
11062 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11063 const UserPerm& perms)
11064 {
11065 std::lock_guard lock(client_lock);
11066
11067 if (unmounting)
11068 return -ENOTCONN;
11069
11070 InodeRef in;
11071 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11072 if (r < 0)
11073 return r;
11074 return _getxattr(in, name, value, size, perms);
11075 }
11076
11077 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11078 const UserPerm& perms)
11079 {
11080 std::lock_guard lock(client_lock);
11081
11082 if (unmounting)
11083 return -ENOTCONN;
11084
11085 InodeRef in;
11086 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11087 if (r < 0)
11088 return r;
11089 return _getxattr(in, name, value, size, perms);
11090 }
11091
11092 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11093 const UserPerm& perms)
11094 {
11095 std::lock_guard lock(client_lock);
11096
11097 if (unmounting)
11098 return -ENOTCONN;
11099
11100 Fh *f = get_filehandle(fd);
11101 if (!f)
11102 return -EBADF;
11103 return _getxattr(f->inode, name, value, size, perms);
11104 }
11105
11106 int Client::listxattr(const char *path, char *list, size_t size,
11107 const UserPerm& perms)
11108 {
11109 std::lock_guard lock(client_lock);
11110
11111 if (unmounting)
11112 return -ENOTCONN;
11113
11114 InodeRef in;
11115 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11116 if (r < 0)
11117 return r;
11118 return Client::_listxattr(in.get(), list, size, perms);
11119 }
11120
11121 int Client::llistxattr(const char *path, char *list, size_t size,
11122 const UserPerm& perms)
11123 {
11124 std::lock_guard lock(client_lock);
11125
11126 if (unmounting)
11127 return -ENOTCONN;
11128
11129 InodeRef in;
11130 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11131 if (r < 0)
11132 return r;
11133 return Client::_listxattr(in.get(), list, size, perms);
11134 }
11135
11136 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11137 {
11138 std::lock_guard lock(client_lock);
11139
11140 if (unmounting)
11141 return -ENOTCONN;
11142
11143 Fh *f = get_filehandle(fd);
11144 if (!f)
11145 return -EBADF;
11146 return Client::_listxattr(f->inode.get(), list, size, perms);
11147 }
11148
11149 int Client::removexattr(const char *path, const char *name,
11150 const UserPerm& perms)
11151 {
11152 std::lock_guard lock(client_lock);
11153
11154 if (unmounting)
11155 return -ENOTCONN;
11156
11157 InodeRef in;
11158 int r = Client::path_walk(path, &in, perms, true);
11159 if (r < 0)
11160 return r;
11161 return _removexattr(in, name, perms);
11162 }
11163
11164 int Client::lremovexattr(const char *path, const char *name,
11165 const UserPerm& perms)
11166 {
11167 std::lock_guard lock(client_lock);
11168
11169 if (unmounting)
11170 return -ENOTCONN;
11171
11172 InodeRef in;
11173 int r = Client::path_walk(path, &in, perms, false);
11174 if (r < 0)
11175 return r;
11176 return _removexattr(in, name, perms);
11177 }
11178
11179 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11180 {
11181 std::lock_guard lock(client_lock);
11182
11183 if (unmounting)
11184 return -ENOTCONN;
11185
11186 Fh *f = get_filehandle(fd);
11187 if (!f)
11188 return -EBADF;
11189 return _removexattr(f->inode, name, perms);
11190 }
11191
11192 int Client::setxattr(const char *path, const char *name, const void *value,
11193 size_t size, int flags, const UserPerm& perms)
11194 {
11195 _setxattr_maybe_wait_for_osdmap(name, value, size);
11196
11197 std::lock_guard lock(client_lock);
11198
11199 if (unmounting)
11200 return -ENOTCONN;
11201
11202 InodeRef in;
11203 int r = Client::path_walk(path, &in, perms, true);
11204 if (r < 0)
11205 return r;
11206 return _setxattr(in, name, value, size, flags, perms);
11207 }
11208
11209 int Client::lsetxattr(const char *path, const char *name, const void *value,
11210 size_t size, int flags, const UserPerm& perms)
11211 {
11212 _setxattr_maybe_wait_for_osdmap(name, value, size);
11213
11214 std::lock_guard lock(client_lock);
11215
11216 if (unmounting)
11217 return -ENOTCONN;
11218
11219 InodeRef in;
11220 int r = Client::path_walk(path, &in, perms, false);
11221 if (r < 0)
11222 return r;
11223 return _setxattr(in, name, value, size, flags, perms);
11224 }
11225
11226 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11227 int flags, const UserPerm& perms)
11228 {
11229 _setxattr_maybe_wait_for_osdmap(name, value, size);
11230
11231 std::lock_guard lock(client_lock);
11232
11233 if (unmounting)
11234 return -ENOTCONN;
11235
11236 Fh *f = get_filehandle(fd);
11237 if (!f)
11238 return -EBADF;
11239 return _setxattr(f->inode, name, value, size, flags, perms);
11240 }
11241
11242 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11243 const UserPerm& perms)
11244 {
11245 int r;
11246
11247 const VXattr *vxattr = _match_vxattr(in, name);
11248 if (vxattr) {
11249 r = -ENODATA;
11250
11251 // Do a forced getattr to get the latest quota before returning
11252 // a value to userspace.
11253 int flags = 0;
11254 if (vxattr->flags & VXATTR_RSTAT) {
11255 flags |= CEPH_STAT_RSTAT;
11256 }
11257 r = _getattr(in, flags, perms, true);
11258 if (r != 0) {
11259 // Error from getattr!
11260 return r;
11261 }
11262
11263 // call pointer-to-member function
11264 char buf[256];
11265 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11266 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11267 } else {
11268 r = -ENODATA;
11269 }
11270
11271 if (size != 0) {
11272 if (r > (int)size) {
11273 r = -ERANGE;
11274 } else if (r > 0) {
11275 memcpy(value, buf, r);
11276 }
11277 }
11278 goto out;
11279 }
11280
11281 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11282 r = -EOPNOTSUPP;
11283 goto out;
11284 }
11285
11286 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11287 if (r == 0) {
11288 string n(name);
11289 r = -ENODATA;
11290 if (in->xattrs.count(n)) {
11291 r = in->xattrs[n].length();
11292 if (r > 0 && size != 0) {
11293 if (size >= (unsigned)r)
11294 memcpy(value, in->xattrs[n].c_str(), r);
11295 else
11296 r = -ERANGE;
11297 }
11298 }
11299 }
11300 out:
11301 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
11302 return r;
11303 }
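// Illustrative sketch (not part of the original source): as in the kernel
// xattr API, passing size == 0 above returns the value's length without
// copying anything, so callers typically probe and then allocate:
//
//   int len = cl->getxattr(path, "user.comment", nullptr, 0, perms);
//   if (len >= 0) {
//     std::vector<char> buf(len);
//     len = cl->getxattr(path, "user.comment", buf.data(), buf.size(), perms);
//   }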
11304
11305 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11306 const UserPerm& perms)
11307 {
11308 if (cct->_conf->client_permissions) {
11309 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11310 if (r < 0)
11311 return r;
11312 }
11313 return _getxattr(in.get(), name, value, size, perms);
11314 }
11315
11316 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11317 size_t size, const UserPerm& perms)
11318 {
11319 std::lock_guard lock(client_lock);
11320
11321 if (unmounting)
11322 return -ENOTCONN;
11323
11324 vinodeno_t vino = _get_vino(in);
11325
11326 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11327 tout(cct) << __func__ << std::endl;
11328 tout(cct) << vino.ino.val << std::endl;
11329 tout(cct) << name << std::endl;
11330
11331 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11332 "fuse_default_permissions");
11333 if (!fuse_default_permissions) {
11334 int r = xattr_permission(in, name, MAY_READ, perms);
11335 if (r < 0)
11336 return r;
11337 }
11338
11339 return _getxattr(in, name, value, size, perms);
11340 }
11341
11342 int Client::_listxattr(Inode *in, char *name, size_t size,
11343 const UserPerm& perms)
11344 {
11345 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11346 if (r == 0) {
11347 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
11348 p != in->xattrs.end();
11349 ++p)
11350 r += p->first.length() + 1;
11351
11352 const VXattr *vxattrs = _get_vxattrs(in);
11353 r += _vxattrs_name_size(vxattrs);
11354
11355 if (size != 0) {
11356 if (size >= (unsigned)r) {
11357 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
11358 p != in->xattrs.end();
11359 ++p) {
11360 memcpy(name, p->first.c_str(), p->first.length());
11361 name += p->first.length();
11362 *name = '\0';
11363 name++;
11364 }
11365 if (vxattrs) {
11366 for (int i = 0; !vxattrs[i].name.empty(); i++) {
11367 const VXattr& vxattr = vxattrs[i];
11368 if (vxattr.hidden)
11369 continue;
11370 // skip vxattrs whose exists_cb reports the attribute as absent
11371 if (vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
11372 continue;
11373 memcpy(name, vxattr.name.c_str(), vxattr.name.length());
11374 name += vxattr.name.length();
11375 *name = '\0';
11376 name++;
11377 }
11378 }
11379 } else
11380 r = -ERANGE;
11381 }
11382 }
11383 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
11384 return r;
11385 }
11386
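// Buffer layout sketch (illustrative): _listxattr packs the names back to
// back, each terminated by '\0', exactly as listxattr(2) expects, e.g.
//
//   "user.a\0security.selinux\0ceph.dir.rbytes\0"
//
// A size == 0 call returns the total length needed; a buffer that is too
// small yields -ERANGE rather than a silently truncated list.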
11387 int Client::ll_listxattr(Inode *in, char *names, size_t size,
11388 const UserPerm& perms)
11389 {
11390 std::lock_guard lock(client_lock);
11391
11392 if (unmounting)
11393 return -ENOTCONN;
11394
11395 vinodeno_t vino = _get_vino(in);
11396
11397 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11398 tout(cct) << __func__ << std::endl;
11399 tout(cct) << vino.ino.val << std::endl;
11400 tout(cct) << size << std::endl;
11401
11402 return _listxattr(in, names, size, perms);
11403 }
11404
11405 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11406 size_t size, int flags, const UserPerm& perms)
11407 {
11408
11409 int xattr_flags = 0;
11410 if (!value)
11411 xattr_flags |= CEPH_XATTR_REMOVE;
11412 if (flags & XATTR_CREATE)
11413 xattr_flags |= CEPH_XATTR_CREATE;
11414 if (flags & XATTR_REPLACE)
11415 xattr_flags |= CEPH_XATTR_REPLACE;
11416
11417 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11418 filepath path;
11419 in->make_nosnap_relative_path(path);
11420 req->set_filepath(path);
11421 req->set_string2(name);
11422 req->set_inode(in);
11423 req->head.args.setxattr.flags = xattr_flags;
11424
11425 bufferlist bl;
11426 ceph_assert(value || size == 0);
11427 bl.append((const char*)value, size);
11428 req->set_data(bl);
11429
11430 int res = make_request(req, perms);
11431
11432 trim_cache();
11433 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
11434 res << dendl;
11435 return res;
11436 }
11437
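// The wire flag mapping above mirrors setxattr(2) semantics, roughly:
//
//   value == NULL         -> CEPH_XATTR_REMOVE  (remove the attribute)
//   flags & XATTR_CREATE  -> CEPH_XATTR_CREATE  (fail if it already exists)
//   flags & XATTR_REPLACE -> CEPH_XATTR_REPLACE (fail if it does not exist)
//
// The MDS, not the client, enforces these constraints when it executes the
// CEPH_MDS_OP_SETXATTR request.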
11438 int Client::_setxattr(Inode *in, const char *name, const void *value,
11439 size_t size, int flags, const UserPerm& perms)
11440 {
11441 if (in->snapid != CEPH_NOSNAP) {
11442 return -EROFS;
11443 }
11444
11445 bool posix_acl_xattr = false;
11446 if (acl_type == POSIX_ACL)
11447 posix_acl_xattr = !strncmp(name, "system.", 7);
11448
11449 if (strncmp(name, "user.", 5) &&
11450 strncmp(name, "security.", 9) &&
11451 strncmp(name, "trusted.", 8) &&
11452 strncmp(name, "ceph.", 5) &&
11453 !posix_acl_xattr)
11454 return -EOPNOTSUPP;
11455
11456 bool check_realm = false;
11457
11458 if (posix_acl_xattr) {
11459 if (!strcmp(name, ACL_EA_ACCESS)) {
11460 mode_t new_mode = in->mode;
11461 if (value) {
11462 int ret = posix_acl_equiv_mode(value, size, &new_mode);
11463 if (ret < 0)
11464 return ret;
11465 if (ret == 0) {
11466 value = NULL;
11467 size = 0;
11468 }
11469 if (new_mode != in->mode) {
11470 struct ceph_statx stx;
11471 stx.stx_mode = new_mode;
11472 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
11473 if (ret < 0)
11474 return ret;
11475 }
11476 }
11477 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
11478 if (value) {
11479 if (!S_ISDIR(in->mode))
11480 return -EACCES;
11481 int ret = posix_acl_check(value, size);
11482 if (ret < 0)
11483 return -EINVAL;
11484 if (ret == 0) {
11485 value = NULL;
11486 size = 0;
11487 }
11488 }
11489 } else {
11490 return -EOPNOTSUPP;
11491 }
11492 } else {
11493 const VXattr *vxattr = _match_vxattr(in, name);
11494 if (vxattr) {
11495 if (vxattr->readonly)
11496 return -EOPNOTSUPP;
11497 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
11498 check_realm = true;
11499 }
11500 }
11501
11502 int ret = _do_setxattr(in, name, value, size, flags, perms);
11503 if (ret >= 0 && check_realm) {
11504 // check if snaprealm was created for quota inode
11505 if (in->quota.is_enable() &&
11506 !(in->snaprealm && in->snaprealm->ino == in->ino))
11507 ret = -EOPNOTSUPP;
11508 }
11509
11510 return ret;
11511 }
11512
11513 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11514 size_t size, int flags, const UserPerm& perms)
11515 {
11516 if (cct->_conf->client_permissions) {
11517 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11518 if (r < 0)
11519 return r;
11520 }
11521 return _setxattr(in.get(), name, value, size, flags, perms);
11522 }
11523
11524 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11525 {
11526 string tmp;
11527 if (name == "layout") {
11528 string::iterator begin = value.begin();
11529 string::iterator end = value.end();
11530 keys_and_values<string::iterator> p; // create instance of parser
11531 std::map<string, string> m; // map to receive results
11532 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11533 return -EINVAL;
11534 }
11535 if (begin != end)
11536 return -EINVAL;
11537 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11538 if (q->first == "pool") {
11539 tmp = q->second;
11540 break;
11541 }
11542 }
11543 } else if (name == "layout.pool") {
11544 tmp = value;
11545 }
11546
11547 if (tmp.length()) {
11548 int64_t pool;
11549 try {
11550 pool = boost::lexical_cast<unsigned>(tmp);
11551 if (!osdmap->have_pg_pool(pool))
11552 return -ENOENT;
11553 } catch (boost::bad_lexical_cast const&) {
11554 pool = osdmap->lookup_pg_pool_name(tmp);
11555 if (pool < 0) {
11556 return -ENOENT;
11557 }
11558 }
11559 }
11560
11561 return 0;
11562 }
11563
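// Example inputs (illustrative): the pool may be named either way, so both
// of the following must resolve against the client's current osdmap:
//
//   "ceph.file.layout.pool" = "3"            -> lexical_cast to pool id 3
//   "ceph.file.layout.pool" = "cephfs_data"  -> lookup_pg_pool_name()
//   "ceph.dir.layout" = "stripe_unit=4194304 pool=cephfs_data"
//
// -ENOENT from either path means the pool is not yet in the local osdmap,
// which triggers the osdmap wait below.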
11564 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11565 {
11566 // Setting a layout pool requires the MetaRequest to carry a recent osdmap
11567 // epoch: a newly created data pool may be unknown to both client and MDS.
11568 // Fetching the latest osdmap lets the MDS quickly judge whether it needs one.
11569 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11570 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11571 string rest(strstr(name, "layout"));
11572 string v((const char*)value, size);
11573 int r = objecter->with_osdmap([&](const OSDMap& o) {
11574 return _setxattr_check_data_pool(rest, v, &o);
11575 });
11576
11577 if (r == -ENOENT) {
11578 C_SaferCond ctx;
11579 objecter->wait_for_latest_osdmap(&ctx);
11580 ctx.wait();
11581 }
11582 }
11583 }
11584
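// Note that the check is not repeated after the wait: the setxattr request
// simply proceeds with the newer epoch and the MDS performs the final
// validation. Waiting only on -ENOENT keeps the common case (pool already
// known) free of an extra osdmap round trip.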
11585 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11586 size_t size, int flags, const UserPerm& perms)
11587 {
11588 _setxattr_maybe_wait_for_osdmap(name, value, size);
11589
11590 std::lock_guard lock(client_lock);
11591
11592 if (unmounting)
11593 return -ENOTCONN;
11594
11595 vinodeno_t vino = _get_vino(in);
11596
11597 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11598 tout(cct) << __func__ << std::endl;
11599 tout(cct) << vino.ino.val << std::endl;
11600 tout(cct) << name << std::endl;
11601
11602 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11603 "fuse_default_permissions");
11604 if (!fuse_default_permissions) {
11605 int r = xattr_permission(in, name, MAY_WRITE, perms);
11606 if (r < 0)
11607 return r;
11608 }
11609 return _setxattr(in, name, value, size, flags, perms);
11610 }
11611
11612 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11613 {
11614 if (in->snapid != CEPH_NOSNAP) {
11615 return -EROFS;
11616 }
11617
11618 // allow only the xattr namespaces that the kernel client supports
11619 if (strncmp(name, "user.", 5) &&
11620 strncmp(name, "system.", 7) &&
11621 strncmp(name, "security.", 9) &&
11622 strncmp(name, "trusted.", 8) &&
11623 strncmp(name, "ceph.", 5))
11624 return -EOPNOTSUPP;
11625
11626 const VXattr *vxattr = _match_vxattr(in, name);
11627 if (vxattr && vxattr->readonly)
11628 return -EOPNOTSUPP;
11629
11630 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11631 filepath path;
11632 in->make_nosnap_relative_path(path);
11633 req->set_filepath(path);
11634 req->set_filepath2(name);
11635 req->set_inode(in);
11636
11637 int res = make_request(req, perms);
11638
11639 trim_cache();
11640 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11641 return res;
11642 }
11643
11644 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11645 {
11646 if (cct->_conf->client_permissions) {
11647 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11648 if (r < 0)
11649 return r;
11650 }
11651 return _removexattr(in.get(), name, perms);
11652 }
11653
11654 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11655 {
11656 std::lock_guard lock(client_lock);
11657
11658 if (unmounting)
11659 return -ENOTCONN;
11660
11661 vinodeno_t vino = _get_vino(in);
11662
11663 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11664 tout(cct) << "ll_removexattr" << std::endl;
11665 tout(cct) << vino.ino.val << std::endl;
11666 tout(cct) << name << std::endl;
11667
11668 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11669 "fuse_default_permissions");
11670 if (!fuse_default_permissions) {
11671 int r = xattr_permission(in, name, MAY_WRITE, perms);
11672 if (r < 0)
11673 return r;
11674 }
11675
11676 return _removexattr(in, name, perms);
11677 }
11678
11679 bool Client::_vxattrcb_quota_exists(Inode *in)
11680 {
11681 return in->quota.is_enable() &&
11682 in->snaprealm && in->snaprealm->ino == in->ino;
11683 }
11684 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11685 {
11686 return snprintf(val, size,
11687 "max_bytes=%lld max_files=%lld",
11688 (long long int)in->quota.max_bytes,
11689 (long long int)in->quota.max_files);
11690 }
11691 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11692 {
11693 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11694 }
11695 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11696 {
11697 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11698 }
11699
11700 bool Client::_vxattrcb_layout_exists(Inode *in)
11701 {
11702 return in->layout != file_layout_t();
11703 }
11704 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11705 {
11706 int r = snprintf(val, size,
11707 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11708 (unsigned long long)in->layout.stripe_unit,
11709 (unsigned long long)in->layout.stripe_count,
11710 (unsigned long long)in->layout.object_size);
11711 objecter->with_osdmap([&](const OSDMap& o) {
11712 if (o.have_pg_pool(in->layout.pool_id))
11713 r += snprintf(val + r, size - r, "%s",
11714 o.get_pool_name(in->layout.pool_id).c_str());
11715 else
11716 r += snprintf(val + r, size - r, "%" PRIu64,
11717 (uint64_t)in->layout.pool_id);
11718 });
11719 if (in->layout.pool_ns.length())
11720 r += snprintf(val + r, size - r, " pool_namespace=%s",
11721 in->layout.pool_ns.c_str());
11722 return r;
11723 }
11724 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11725 {
11726 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
11727 }
11728 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11729 {
11730 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
11731 }
11732 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11733 {
11734 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
11735 }
11736 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11737 {
11738 size_t r;
11739 objecter->with_osdmap([&](const OSDMap& o) {
11740 if (o.have_pg_pool(in->layout.pool_id))
11741 r = snprintf(val, size, "%s", o.get_pool_name(
11742 in->layout.pool_id).c_str());
11743 else
11744 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11745 });
11746 return r;
11747 }
11748 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11749 {
11750 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11751 }
11752 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11753 {
11754 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11755 }
11756 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11757 {
11758 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
11759 }
11760 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11761 {
11762 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
11763 }
11764 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11765 {
11766 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11767 }
11768 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11769 {
11770 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
11771 }
11772 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11773 {
11774 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
11775 }
11776 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11777 {
11778 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
11779 }
11780 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11781 {
11782 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
11783 (long)in->rstat.rctime.nsec());
11784 }
11785 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11786 {
11787 return in->dir_pin != -ENODATA;
11788 }
11789 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11790 {
11791 return snprintf(val, size, "%ld", (long)in->dir_pin);
11792 }
11793
11794 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11795 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11796
11797 #define XATTR_NAME_CEPH(_type, _name) \
11798 { \
11799 name: CEPH_XATTR_NAME(_type, _name), \
11800 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11801 readonly: true, \
11802 hidden: false, \
11803 exists_cb: NULL, \
11804 flags: 0, \
11805 }
11806 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
11807 { \
11808 name: CEPH_XATTR_NAME(_type, _name), \
11809 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11810 readonly: true, \
11811 hidden: false, \
11812 exists_cb: NULL, \
11813 flags: _flags, \
11814 }
11815 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11816 { \
11817 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11818 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11819 readonly: false, \
11820 hidden: true, \
11821 exists_cb: &Client::_vxattrcb_layout_exists, \
11822 flags: 0, \
11823 }
11824 #define XATTR_QUOTA_FIELD(_type, _name) \
11825 { \
11826 name: CEPH_XATTR_NAME(_type, _name), \
11827 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11828 readonly: false, \
11829 hidden: true, \
11830 exists_cb: &Client::_vxattrcb_quota_exists, \
11831 flags: 0, \
11832 }
11833
11834 const Client::VXattr Client::_dir_vxattrs[] = {
11835 {
11836 name: "ceph.dir.layout",
11837 getxattr_cb: &Client::_vxattrcb_layout,
11838 readonly: false,
11839 hidden: true,
11840 exists_cb: &Client::_vxattrcb_layout_exists,
11841 flags: 0,
11842 },
11843 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11844 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11845 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11846 XATTR_LAYOUT_FIELD(dir, layout, pool),
11847 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11848 XATTR_NAME_CEPH(dir, entries),
11849 XATTR_NAME_CEPH(dir, files),
11850 XATTR_NAME_CEPH(dir, subdirs),
11851 XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
11852 XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
11853 XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
11854 XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
11855 XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
11856 {
11857 name: "ceph.quota",
11858 getxattr_cb: &Client::_vxattrcb_quota,
11859 readonly: false,
11860 hidden: true,
11861 exists_cb: &Client::_vxattrcb_quota_exists,
11862 flags: 0,
11863 },
11864 XATTR_QUOTA_FIELD(quota, max_bytes),
11865 XATTR_QUOTA_FIELD(quota, max_files),
11866 {
11867 name: "ceph.dir.pin",
11868 getxattr_cb: &Client::_vxattrcb_dir_pin,
11869 readonly: false,
11870 hidden: true,
11871 exists_cb: &Client::_vxattrcb_dir_pin_exists,
11872 flags: 0,
11873 },
11874 { name: "" } /* Required table terminator */
11875 };
11876
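// Table semantics (summarizing the definitions above): iteration stops at the
// empty-name terminator; `hidden` keeps an entry out of _listxattr output
// while leaving it readable by name; `exists_cb` gates both listing and
// reads; and VXATTR_RSTAT forces a CEPH_STAT_RSTAT getattr so recursive
// statistics are fresh before being reported.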
11877 const Client::VXattr Client::_file_vxattrs[] = {
11878 {
11879 name: "ceph.file.layout",
11880 getxattr_cb: &Client::_vxattrcb_layout,
11881 readonly: false,
11882 hidden: true,
11883 exists_cb: &Client::_vxattrcb_layout_exists,
11884 flags: 0,
11885 },
11886 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
11887 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
11888 XATTR_LAYOUT_FIELD(file, layout, object_size),
11889 XATTR_LAYOUT_FIELD(file, layout, pool),
11890 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
11891 { name: "" } /* Required table terminator */
11892 };
11893
11894 const Client::VXattr *Client::_get_vxattrs(Inode *in)
11895 {
11896 if (in->is_dir())
11897 return _dir_vxattrs;
11898 else if (in->is_file())
11899 return _file_vxattrs;
11900 return NULL;
11901 }
11902
11903 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11904 {
11905 if (strncmp(name, "ceph.", 5) == 0) {
11906 const VXattr *vxattr = _get_vxattrs(in);
11907 if (vxattr) {
11908 while (!vxattr->name.empty()) {
11909 if (vxattr->name == name)
11910 return vxattr;
11911 vxattr++;
11912 }
11913 }
11914 }
11915 return NULL;
11916 }
11917
11918 size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11919 {
11920 size_t len = 0;
11921 while (!vxattr->name.empty()) {
11922 if (!vxattr->hidden)
11923 len += vxattr->name.length() + 1;
11924 vxattr++;
11925 }
11926 return len;
11927 }
11928
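// _listxattr() above calls _vxattrs_name_size(), which (presumably, in
// Client.h) returns a cached result of this walk for the two static tables.
// The helper here does the actual counting: one byte per visible name for
// its '\0' separator, with hidden entries skipped.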
11929 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11930 {
11931 std::lock_guard lock(client_lock);
11932
11933 if (unmounting)
11934 return -ENOTCONN;
11935
11936 vinodeno_t vino = _get_vino(in);
11937
11938 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11939 tout(cct) << "ll_readlink" << std::endl;
11940 tout(cct) << vino.ino.val << std::endl;
11941
11942 for (auto dn : in->dentries) {
11943 touch_dn(dn);
11944 }
11945
11946 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11947 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11948 return r;
11949 }
11950
11951 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
11952 const UserPerm& perms, InodeRef *inp)
11953 {
11954 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
11955 << mode << dec << ", " << rdev << ", uid " << perms.uid()
11956 << ", gid " << perms.gid() << ")" << dendl;
11957
11958 if (strlen(name) > NAME_MAX)
11959 return -ENAMETOOLONG;
11960
11961 if (dir->snapid != CEPH_NOSNAP) {
11962 return -EROFS;
11963 }
11964 if (is_quota_files_exceeded(dir, perms)) {
11965 return -EDQUOT;
11966 }
11967
11968 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
11969
11970 filepath path;
11971 dir->make_nosnap_relative_path(path);
11972 path.push_dentry(name);
11973 req->set_filepath(path);
11974 req->set_inode(dir);
11975 req->head.args.mknod.rdev = rdev;
11976 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11977 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11978
11979 bufferlist xattrs_bl;
11980 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11981 if (res < 0)
11982 goto fail;
11983 req->head.args.mknod.mode = mode;
11984 if (xattrs_bl.length() > 0)
11985 req->set_data(xattrs_bl);
11986
11987 Dentry *de;
11988 res = get_or_create(dir, name, &de);
11989 if (res < 0)
11990 goto fail;
11991 req->set_dentry(de);
11992
11993 res = make_request(req, perms, inp);
11994
11995 trim_cache();
11996
11997 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
11998 return res;
11999
12000 fail:
12001 put_request(req);
12002 return res;
12003 }
12004
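// Request lifecycle sketch (applies to _mknod and the sibling factory
// methods below): a MetaRequest is reference counted and make_request()
// consumes the caller's reference on both success and failure, so only the
// early-exit `fail:` paths need an explicit put_request(). Roughly:
//
//   MetaRequest *req = new MetaRequest(op);  // ref held by this function
//   ... fill in path/dentry/inode ...
//   if (setup_failed) { put_request(req); return res; }
//   res = make_request(req, perms);          // ownership passes here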
12005 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12006 dev_t rdev, struct stat *attr, Inode **out,
12007 const UserPerm& perms)
12008 {
12009 std::lock_guard lock(client_lock);
12010
12011 if (unmounting)
12012 return -ENOTCONN;
12013
12014 vinodeno_t vparent = _get_vino(parent);
12015
12016 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12017 tout(cct) << "ll_mknod" << std::endl;
12018 tout(cct) << vparent.ino.val << std::endl;
12019 tout(cct) << name << std::endl;
12020 tout(cct) << mode << std::endl;
12021 tout(cct) << rdev << std::endl;
12022
12023 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12024 "fuse_default_permissions");
12025 if (!fuse_default_permissions) {
12026 int r = may_create(parent, perms);
12027 if (r < 0)
12028 return r;
12029 }
12030
12031 InodeRef in;
12032 int r = _mknod(parent, name, mode, rdev, perms, &in);
12033 if (r == 0) {
12034 fill_stat(in, attr);
12035 _ll_get(in.get());
12036 }
12037 tout(cct) << attr->st_ino << std::endl;
12038 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12039 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12040 *out = in.get();
12041 return r;
12042 }
12043
12044 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12045 dev_t rdev, Inode **out,
12046 struct ceph_statx *stx, unsigned want, unsigned flags,
12047 const UserPerm& perms)
12048 {
12049 unsigned caps = statx_to_mask(flags, want);
12050 std::lock_guard lock(client_lock);
12051
12052 if (unmounting)
12053 return -ENOTCONN;
12054
12055 vinodeno_t vparent = _get_vino(parent);
12056
12057 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12058 tout(cct) << "ll_mknodx" << std::endl;
12059 tout(cct) << vparent.ino.val << std::endl;
12060 tout(cct) << name << std::endl;
12061 tout(cct) << mode << std::endl;
12062 tout(cct) << rdev << std::endl;
12063
12064 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12065 "fuse_default_permissions");
12066 if (!fuse_default_permissions) {
12067 int r = may_create(parent, perms);
12068 if (r < 0)
12069 return r;
12070 }
12071
12072 InodeRef in;
12073 int r = _mknod(parent, name, mode, rdev, perms, &in);
12074 if (r == 0) {
12075 fill_statx(in, caps, stx);
12076 _ll_get(in.get());
12077 }
12078 tout(cct) << stx->stx_ino << std::endl;
12079 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12080 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12081 *out = in.get();
12082 return r;
12083 }
12084
12085 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
12086 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
12087 int object_size, const char *data_pool, bool *created,
12088 const UserPerm& perms)
12089 {
12090 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
12091 mode << dec << ")" << dendl;
12092
12093 if (strlen(name) > NAME_MAX)
12094 return -ENAMETOOLONG;
12095 if (dir->snapid != CEPH_NOSNAP) {
12096 return -EROFS;
12097 }
12098 if (is_quota_files_exceeded(dir, perms)) {
12099 return -EDQUOT;
12100 }
12101
12102 // use normalized flags to generate cmode
12103 int cflags = ceph_flags_sys2wire(flags);
12104 if (cct->_conf.get_val<bool>("client_force_lazyio"))
12105 cflags |= CEPH_O_LAZY;
12106
12107 int cmode = ceph_flags_to_mode(cflags);
12108
12109 int64_t pool_id = -1;
12110 if (data_pool && *data_pool) {
12111 pool_id = objecter->with_osdmap(
12112 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
12113 if (pool_id < 0)
12114 return -EINVAL;
12115 if (pool_id > 0xffffffffll)
12116 return -ERANGE; // bummer!
12117 }
12118
12119 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
12120
12121 filepath path;
12122 dir->make_nosnap_relative_path(path);
12123 path.push_dentry(name);
12124 req->set_filepath(path);
12125 req->set_inode(dir);
12126 req->head.args.open.flags = cflags | CEPH_O_CREAT;
12127
12128 req->head.args.open.stripe_unit = stripe_unit;
12129 req->head.args.open.stripe_count = stripe_count;
12130 req->head.args.open.object_size = object_size;
12131 if (cct->_conf->client_debug_getattr_caps)
12132 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
12133 else
12134 req->head.args.open.mask = 0;
12135 req->head.args.open.pool = pool_id;
12136 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12137 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12138
12139 mode |= S_IFREG;
12140 bufferlist xattrs_bl;
12141 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12142 if (res < 0)
12143 goto fail;
12144 req->head.args.open.mode = mode;
12145 if (xattrs_bl.length() > 0)
12146 req->set_data(xattrs_bl);
12147
12148 Dentry *de;
12149 res = get_or_create(dir, name, &de);
12150 if (res < 0)
12151 goto fail;
12152 req->set_dentry(de);
12153
12154 res = make_request(req, perms, inp, created);
12155 if (res < 0) {
12156 goto reply_error;
12157 }
12158
12159 /* If the caller passed a value in fhp, do the open */
12160 if (fhp) {
12161 (*inp)->get_open_ref(cmode);
12162 *fhp = _create_fh(inp->get(), flags, cmode, perms);
12163 }
12164
12165 reply_error:
12166 trim_cache();
12167
12168 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
12169 << " layout " << stripe_unit
12170 << ' ' << stripe_count
12171 << ' ' << object_size
12172 <<") = " << res << dendl;
12173 return res;
12174
12175 fail:
12176 put_request(req);
12177 return res;
12178 }
12179
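// When the caller passes fhp, _create() also performs the open: it takes an
// open ref for the computed cmode and wraps the new inode in an Fh. Callers
// that only need the inode pass fhp == NULL and open later. Note that
// reply_error falls through to the common logging and trim_cache() exit, so
// a failed make_request still trims the cache.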
12180
12181 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
12182 InodeRef *inp)
12183 {
12184 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
12185 << mode << dec << ", uid " << perm.uid()
12186 << ", gid " << perm.gid() << ")" << dendl;
12187
12188 if (strlen(name) > NAME_MAX)
12189 return -ENAMETOOLONG;
12190
12191 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12192 return -EROFS;
12193 }
12194 if (is_quota_files_exceeded(dir, perm)) {
12195 return -EDQUOT;
12196 }
12197 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
12198 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
12199
12200 filepath path;
12201 dir->make_nosnap_relative_path(path);
12202 path.push_dentry(name);
12203 req->set_filepath(path);
12204 req->set_inode(dir);
12205 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12206 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12207
12208 mode |= S_IFDIR;
12209 bufferlist xattrs_bl;
12210 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
12211 if (res < 0)
12212 goto fail;
12213 req->head.args.mkdir.mode = mode;
12214 if (xattrs_bl.length() > 0)
12215 req->set_data(xattrs_bl);
12216
12217 Dentry *de;
12218 res = get_or_create(dir, name, &de);
12219 if (res < 0)
12220 goto fail;
12221 req->set_dentry(de);
12222
12223 ldout(cct, 10) << "_mkdir: making request" << dendl;
12224 res = make_request(req, perm, inp);
12225 ldout(cct, 10) << "_mkdir result is " << res << dendl;
12226
12227 trim_cache();
12228
12229 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12230 return res;
12231
12232 fail:
12233 put_request(req);
12234 return res;
12235 }
12236
12237 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12238 struct stat *attr, Inode **out, const UserPerm& perm)
12239 {
12240 std::lock_guard lock(client_lock);
12241
12242 if (unmounting)
12243 return -ENOTCONN;
12244
12245 vinodeno_t vparent = _get_vino(parent);
12246
12247 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12248 tout(cct) << "ll_mkdir" << std::endl;
12249 tout(cct) << vparent.ino.val << std::endl;
12250 tout(cct) << name << std::endl;
12251 tout(cct) << mode << std::endl;
12252
12253 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12254 "fuse_default_permissions");
12255 if (!fuse_default_permissions) {
12256 int r = may_create(parent, perm);
12257 if (r < 0)
12258 return r;
12259 }
12260
12261 InodeRef in;
12262 int r = _mkdir(parent, name, mode, perm, &in);
12263 if (r == 0) {
12264 fill_stat(in, attr);
12265 _ll_get(in.get());
12266 }
12267 tout(cct) << attr->st_ino << std::endl;
12268 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12269 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12270 *out = in.get();
12271 return r;
12272 }
12273
12274 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12275 struct ceph_statx *stx, unsigned want, unsigned flags,
12276 const UserPerm& perms)
12277 {
12278 std::lock_guard lock(client_lock);
12279
12280 if (unmounting)
12281 return -ENOTCONN;
12282
12283 vinodeno_t vparent = _get_vino(parent);
12284
12285 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12286 tout(cct) << "ll_mkdirx" << std::endl;
12287 tout(cct) << vparent.ino.val << std::endl;
12288 tout(cct) << name << std::endl;
12289 tout(cct) << mode << std::endl;
12290
12291 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12292 "fuse_default_permissions");
12293 if (!fuse_default_permissions) {
12294 int r = may_create(parent, perms);
12295 if (r < 0)
12296 return r;
12297 }
12298
12299 InodeRef in;
12300 int r = _mkdir(parent, name, mode, perms, &in);
12301 if (r == 0) {
12302 fill_statx(in, statx_to_mask(flags, want), stx);
12303 _ll_get(in.get());
12304 } else {
12305 stx->stx_ino = 0;
12306 stx->stx_mask = 0;
12307 }
12308 tout(cct) << stx->stx_ino << std::endl;
12309 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12310 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12311 *out = in.get();
12312 return r;
12313 }
12314
12315 int Client::_symlink(Inode *dir, const char *name, const char *target,
12316 const UserPerm& perms, InodeRef *inp)
12317 {
12318 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
12319 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
12320 << dendl;
12321
12322 if (strlen(name) > NAME_MAX)
12323 return -ENAMETOOLONG;
12324
12325 if (dir->snapid != CEPH_NOSNAP) {
12326 return -EROFS;
12327 }
12328 if (is_quota_files_exceeded(dir, perms)) {
12329 return -EDQUOT;
12330 }
12331
12332 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
12333
12334 filepath path;
12335 dir->make_nosnap_relative_path(path);
12336 path.push_dentry(name);
12337 req->set_filepath(path);
12338 req->set_inode(dir);
12339 req->set_string2(target);
12340 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12341 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12342
12343 Dentry *de;
12344 int res = get_or_create(dir, name, &de);
12345 if (res < 0)
12346 goto fail;
12347 req->set_dentry(de);
12348
12349 res = make_request(req, perms, inp);
12350
12351 trim_cache();
12352 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
12353 res << dendl;
12354 return res;
12355
12356 fail:
12357 put_request(req);
12358 return res;
12359 }
12360
12361 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12362 struct stat *attr, Inode **out, const UserPerm& perms)
12363 {
12364 std::lock_guard lock(client_lock);
12365
12366 if (unmounting)
12367 return -ENOTCONN;
12368
12369 vinodeno_t vparent = _get_vino(parent);
12370
12371 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12372 << dendl;
12373 tout(cct) << "ll_symlink" << std::endl;
12374 tout(cct) << vparent.ino.val << std::endl;
12375 tout(cct) << name << std::endl;
12376 tout(cct) << value << std::endl;
12377
12378 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12379 "fuse_default_permissions");
12380 if (!fuse_default_permissions) {
12381 int r = may_create(parent, perms);
12382 if (r < 0)
12383 return r;
12384 }
12385
12386 InodeRef in;
12387 int r = _symlink(parent, name, value, perms, &in);
12388 if (r == 0) {
12389 fill_stat(in, attr);
12390 _ll_get(in.get());
12391 }
12392 tout(cct) << attr->st_ino << std::endl;
12393 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12394 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12395 *out = in.get();
12396 return r;
12397 }
12398
12399 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12400 Inode **out, struct ceph_statx *stx, unsigned want,
12401 unsigned flags, const UserPerm& perms)
12402 {
12403 std::lock_guard lock(client_lock);
12404
12405 if (unmounting)
12406 return -ENOTCONN;
12407
12408 vinodeno_t vparent = _get_vino(parent);
12409
12410 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12411 << dendl;
12412 tout(cct) << "ll_symlinkx" << std::endl;
12413 tout(cct) << vparent.ino.val << std::endl;
12414 tout(cct) << name << std::endl;
12415 tout(cct) << value << std::endl;
12416
12417 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12418 "fuse_default_permissions");
12419 if (!fuse_default_permissions) {
12420 int r = may_create(parent, perms);
12421 if (r < 0)
12422 return r;
12423 }
12424
12425 InodeRef in;
12426 int r = _symlink(parent, name, value, perms, &in);
12427 if (r == 0) {
12428 fill_statx(in, statx_to_mask(flags, want), stx);
12429 _ll_get(in.get());
12430 }
12431 tout(cct) << stx->stx_ino << std::endl;
12432 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12433 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12434 *out = in.get();
12435 return r;
12436 }
12437
12438 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12439 {
12440 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
12441 << " uid " << perm.uid() << " gid " << perm.gid()
12442 << ")" << dendl;
12443
12444 if (dir->snapid != CEPH_NOSNAP) {
12445 return -EROFS;
12446 }
12447
12448 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12449
12450 filepath path;
12451 dir->make_nosnap_relative_path(path);
12452 path.push_dentry(name);
12453 req->set_filepath(path);
12454
12455 InodeRef otherin;
12456 Inode *in;
12457 Dentry *de;
12458
12459 int res = get_or_create(dir, name, &de);
12460 if (res < 0)
12461 goto fail;
12462 req->set_dentry(de);
12463 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12464 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12465
12466 res = _lookup(dir, name, 0, &otherin, perm);
12467 if (res < 0)
12468 goto fail;
12469
12470 in = otherin.get();
12471 req->set_other_inode(in);
12472 in->break_all_delegs();
12473 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12474
12475 req->set_inode(dir);
12476
12477 res = make_request(req, perm);
12478
12479 trim_cache();
12480 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
12481 return res;
12482
12483 fail:
12484 put_request(req);
12485 return res;
12486 }
12487
12488 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12489 {
12490 std::lock_guard lock(client_lock);
12491
12492 if (unmounting)
12493 return -ENOTCONN;
12494
12495 vinodeno_t vino = _get_vino(in);
12496
12497 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12498 tout(cct) << "ll_unlink" << std::endl;
12499 tout(cct) << vino.ino.val << std::endl;
12500 tout(cct) << name << std::endl;
12501
12502 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12503 "fuse_default_permissions");
12504 if (!fuse_default_permissions) {
12505 int r = may_delete(in, name, perm);
12506 if (r < 0)
12507 return r;
12508 }
12509 return _unlink(in, name, perm);
12510 }
12511
12512 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12513 {
12514 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
12515 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12516
12517 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12518 return -EROFS;
12519 }
12520
12521 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12522 MetaRequest *req = new MetaRequest(op);
12523 filepath path;
12524 dir->make_nosnap_relative_path(path);
12525 path.push_dentry(name);
12526 req->set_filepath(path);
12527 req->set_inode(dir);
12528
12529 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12530 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12531 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12532
12533 InodeRef in;
12534
12535 Dentry *de;
12536 int res = get_or_create(dir, name, &de);
12537 if (res < 0)
12538 goto fail;
12539 if (op == CEPH_MDS_OP_RMDIR)
12540 req->set_dentry(de);
12541 else
12542 de->get();
12543
12544 res = _lookup(dir, name, 0, &in, perms);
12545 if (res < 0)
12546 goto fail;
12547
12548 if (op == CEPH_MDS_OP_RMSNAP) {
12549 unlink(de, true, true);
12550 de->put();
12551 }
12552 req->set_other_inode(in.get());
12553
12554 res = make_request(req, perms);
12555
12556 trim_cache();
12557 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
12558 return res;
12559
12560 fail:
12561 put_request(req);
12562 return res;
12563 }
12564
12565 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12566 {
12567 std::lock_guard lock(client_lock);
12568
12569 if (unmounting)
12570 return -ENOTCONN;
12571
12572 vinodeno_t vino = _get_vino(in);
12573
12574 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12575 tout(cct) << "ll_rmdir" << std::endl;
12576 tout(cct) << vino.ino.val << std::endl;
12577 tout(cct) << name << std::endl;
12578
12579 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12580 "fuse_default_permissions");
12581 if (!fuse_default_permissions) {
12582 int r = may_delete(in, name, perms);
12583 if (r < 0)
12584 return r;
12585 }
12586
12587 return _rmdir(in, name, perms);
12588 }
12589
12590 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12591 {
12592 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
12593 << todir->ino << " " << toname
12594 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12595 << dendl;
12596
12597 if (fromdir->snapid != todir->snapid)
12598 return -EXDEV;
12599
12600 int op = CEPH_MDS_OP_RENAME;
12601 if (fromdir->snapid != CEPH_NOSNAP) {
12602 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12603 op = CEPH_MDS_OP_RENAMESNAP;
12604 else
12605 return -EROFS;
12606 }
12607 if (fromdir != todir) {
12608 Inode *fromdir_root =
12609 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12610 Inode *todir_root =
12611 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12612 if (fromdir_root != todir_root) {
12613 return -EXDEV;
12614 }
12615 }
12616
12617 InodeRef target;
12618 MetaRequest *req = new MetaRequest(op);
12619
12620 filepath from;
12621 fromdir->make_nosnap_relative_path(from);
12622 from.push_dentry(fromname);
12623 filepath to;
12624 todir->make_nosnap_relative_path(to);
12625 to.push_dentry(toname);
12626 req->set_filepath(to);
12627 req->set_filepath2(from);
12628
12629 Dentry *oldde;
12630 int res = get_or_create(fromdir, fromname, &oldde);
12631 if (res < 0)
12632 goto fail;
12633 Dentry *de;
12634 res = get_or_create(todir, toname, &de);
12635 if (res < 0)
12636 goto fail;
12637
12638 if (op == CEPH_MDS_OP_RENAME) {
12639 req->set_old_dentry(oldde);
12640 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12641 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12642
12643 req->set_dentry(de);
12644 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12645 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12646
12647 InodeRef oldin, otherin;
12648 res = _lookup(fromdir, fromname, 0, &oldin, perm);
12649 if (res < 0)
12650 goto fail;
12651
12652 Inode *oldinode = oldin.get();
12653 oldinode->break_all_delegs();
12654 req->set_old_inode(oldinode);
12655 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12656
12657 res = _lookup(todir, toname, 0, &otherin, perm);
12658 switch (res) {
12659 case 0:
12660 {
12661 Inode *in = otherin.get();
12662 req->set_other_inode(in);
12663 in->break_all_delegs();
12664 }
12665 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12666 break;
12667 case -ENOENT:
12668 break;
12669 default:
12670 goto fail;
12671 }
12672
12673 req->set_inode(todir);
12674 } else {
12675 // renamesnap reply contains no tracedn, so we need to invalidate
12676 // dentry manually
12677 unlink(oldde, true, true);
12678 unlink(de, true, true);
12679
12680 req->set_inode(todir);
12681 }
12682
12683 res = make_request(req, perm, &target);
12684 ldout(cct, 10) << "rename result is " << res << dendl;
12685
12686 // the renamed item is trimmed from our cache below
12687
12688 trim_cache();
12689 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
12690 return res;
12691
12692 fail:
12693 put_request(req);
12694 return res;
12695 }
12696
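// Quota-root check example (illustrative): with a quota set on /a, renaming
// /a/x to /b/y crosses from quota root "a" into a subtree with a different
// quota root, so _rename() returns -EXDEV; callers such as mv(1) then fall
// back to copy+unlink, which keeps per-subtree quota accounting correct.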
12697 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12698 const char *newname, const UserPerm& perm)
12699 {
12700 std::lock_guard lock(client_lock);
12701
12702 if (unmounting)
12703 return -ENOTCONN;
12704
12705 vinodeno_t vparent = _get_vino(parent);
12706 vinodeno_t vnewparent = _get_vino(newparent);
12707
12708 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12709 << vnewparent << " " << newname << dendl;
12710 tout(cct) << "ll_rename" << std::endl;
12711 tout(cct) << vparent.ino.val << std::endl;
12712 tout(cct) << name << std::endl;
12713 tout(cct) << vnewparent.ino.val << std::endl;
12714 tout(cct) << newname << std::endl;
12715
12716 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12717 "fuse_default_permissions");
12718 if (!fuse_default_permissions) {
12719 int r = may_delete(parent, name, perm);
12720 if (r < 0)
12721 return r;
12722 r = may_delete(newparent, newname, perm);
12723 if (r < 0 && r != -ENOENT)
12724 return r;
12725 }
12726
12727 return _rename(parent, name, newparent, newname, perm);
12728 }
12729
12730 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12731 {
12732 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
12733 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12734
12735 if (strlen(newname) > NAME_MAX)
12736 return -ENAMETOOLONG;
12737
12738 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12739 return -EROFS;
12740 }
12741 if (is_quota_files_exceeded(dir, perm)) {
12742 return -EDQUOT;
12743 }
12744
12745 in->break_all_delegs();
12746 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12747
12748 filepath path(newname, dir->ino);
12749 req->set_filepath(path);
12750 filepath existing(in->ino);
12751 req->set_filepath2(existing);
12752
12753 req->set_inode(dir);
12754 req->inode_drop = CEPH_CAP_FILE_SHARED;
12755 req->inode_unless = CEPH_CAP_FILE_EXCL;
12756
12757 Dentry *de;
12758 int res = get_or_create(dir, newname, &de);
12759 if (res < 0)
12760 goto fail;
12761 req->set_dentry(de);
12762
12763 res = make_request(req, perm, inp);
12764 ldout(cct, 10) << "link result is " << res << dendl;
12765
12766 trim_cache();
12767 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
12768 return res;
12769
12770 fail:
12771 put_request(req);
12772 return res;
12773 }
12774
12775 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12776 const UserPerm& perm)
12777 {
12778 std::lock_guard lock(client_lock);
12779
12780 if (unmounting)
12781 return -ENOTCONN;
12782
12783 vinodeno_t vino = _get_vino(in);
12784 vinodeno_t vnewparent = _get_vino(newparent);
12785
12786 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12787 newname << dendl;
12788 tout(cct) << "ll_link" << std::endl;
12789 tout(cct) << vino.ino.val << std::endl;
12790 tout(cct) << vnewparent.ino.val << std::endl;
12791 tout(cct) << newname << std::endl;
12792
12793 InodeRef target;
12794
12795 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12796 "fuse_default_permissions");
12797 if (!fuse_default_permissions) {
12798 if (S_ISDIR(in->mode))
12799 return -EPERM;
12800
12801 int r = may_hardlink(in, perm);
12802 if (r < 0)
12803 return r;
12804
12805 r = may_create(newparent, perm);
12806 if (r < 0)
12807 return r;
12808 }
12809
12810 return _link(in, newparent, newname, perm, &target);
12811 }
12812
12813 int Client::ll_num_osds(void)
12814 {
12815 std::lock_guard lock(client_lock);
12816 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12817 }
12818
12819 int Client::ll_osdaddr(int osd, uint32_t *addr)
12820 {
12821 std::lock_guard lock(client_lock);
12822
12823 entity_addr_t g;
12824 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12825 if (!o.exists(osd))
12826 return false;
12827 g = o.get_addrs(osd).front();
12828 return true;
12829 });
12830 if (!exists)
12831 return -1;
12832 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12833 *addr = ntohl(nb_addr);
12834 return 0;
12835 }
12836
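// ll_osdaddr() reports only the first address of the OSD and assumes IPv4
// (in4_addr()); the result is in host byte order. A hedged usage sketch,
// assuming a valid Client *cl:
//
//   uint32_t a;
//   if (cl->ll_osdaddr(0, &a) == 0)
//     printf("osd.0 at %u.%u.%u.%u\n",
//            (a >> 24) & 0xff, (a >> 16) & 0xff, (a >> 8) & 0xff, a & 0xff);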
12837 uint32_t Client::ll_stripe_unit(Inode *in)
12838 {
12839 std::lock_guard lock(client_lock);
12840 return in->layout.stripe_unit;
12841 }
12842
12843 uint64_t Client::ll_snap_seq(Inode *in)
12844 {
12845 std::lock_guard lock(client_lock);
12846 return in->snaprealm->seq;
12847 }
12848
12849 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12850 {
12851 std::lock_guard lock(client_lock);
12852 *layout = in->layout;
12853 return 0;
12854 }
12855
12856 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12857 {
12858 return ll_file_layout(fh->inode.get(), layout);
12859 }
12860
12861 /* Currently we cannot take advantage of redundancy in reads, since we
12862 would have to go through all possible placement groups (a
12863 potentially quite large number determined by a hash), and use CRUSH
12864 to calculate the appropriate set of OSDs for each placement group,
12865 then index into that. An array with one entry per OSD is much more
12866 tractable and works for demonstration purposes. */
12867
12868 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12869 file_layout_t* layout)
12870 {
12871 std::lock_guard lock(client_lock);
12872
12873 inodeno_t ino = in->ino;
12874 uint32_t object_size = layout->object_size;
12875 uint32_t su = layout->stripe_unit;
12876 uint32_t stripe_count = layout->stripe_count;
12877 uint64_t stripes_per_object = object_size / su;
12878 uint64_t stripeno = 0, stripepos = 0;
12879
12880 if (stripe_count) {
12881 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12882 stripepos = blockno % stripe_count; // which object in the object set (X)
12883 }
12884 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12885 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12886
12887 object_t oid = file_object_t(ino, objectno);
12888 return objecter->with_osdmap([&](const OSDMap& o) {
12889 ceph_object_layout olayout =
12890 o.file_to_object_layout(oid, *layout);
12891 pg_t pg = (pg_t)olayout.ol_pgid;
12892 vector<int> osds;
12893 int primary;
12894 o.pg_to_acting_osds(pg, &osds, &primary);
12895 return primary;
12896 });
12897 }
12898
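// Worked example (illustrative numbers): with object_size = 4 MiB,
// stripe_unit = 1 MiB and stripe_count = 3, block 7 maps as
//
//   stripes_per_object = 4194304 / 1048576 = 4
//   stripeno    = 7 / 3 = 2       (third horizontal stripe)
//   stripepos   = 7 % 3 = 1       (second object in the object set)
//   objectsetno = 2 / 4 = 0
//   objectno    = 0 * 3 + 1 = 1
//
// so block 7 lives in object 1 of the file, and the OSDMap lookup returns
// the primary OSD of that object's placement group. Note the math assumes a
// sane layout: a zero stripe_unit would divide by zero above.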
12899 /* Return the offset of the block, internal to the object */
12900
12901 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12902 {
12903 std::lock_guard lock(client_lock);
12904 file_layout_t *layout=&(in->layout);
12905 uint32_t object_size = layout->object_size;
12906 uint32_t su = layout->stripe_unit;
12907 uint64_t stripes_per_object = object_size / su;
12908
12909 return (blockno % stripes_per_object) * su;
12910 }
12911
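// Continuing the illustrative numbers above (stripes_per_object = 4,
// stripe_unit = 1 MiB): block 7 starts at (7 % 4) * 1048576 = 3 MiB into
// its object.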
12912 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12913 const UserPerm& perms)
12914 {
12915 std::lock_guard lock(client_lock);
12916
12917 if (unmounting)
12918 return -ENOTCONN;
12919
12920 vinodeno_t vino = _get_vino(in);
12921
12922 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12923 tout(cct) << "ll_opendir" << std::endl;
12924 tout(cct) << vino.ino.val << std::endl;
12925
12926 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12927 "fuse_default_permissions");
12928 if (!fuse_default_permissions) {
12929 int r = may_open(in, flags, perms);
12930 if (r < 0)
12931 return r;
12932 }
12933
12934 int r = _opendir(in, dirpp, perms);
12935 tout(cct) << (unsigned long)*dirpp << std::endl;
12936
12937 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12938 << dendl;
12939 return r;
12940 }
12941
12942 int Client::ll_releasedir(dir_result_t *dirp)
12943 {
12944 std::lock_guard lock(client_lock);
12945 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12946 tout(cct) << "ll_releasedir" << std::endl;
12947 tout(cct) << (unsigned long)dirp << std::endl;
12948
12949 if (unmounting)
12950 return -ENOTCONN;
12951
12952 _closedir(dirp);
12953 return 0;
12954 }
12955
12956 int Client::ll_fsyncdir(dir_result_t *dirp)
12957 {
12958 std::lock_guard lock(client_lock);
12959 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12960 tout(cct) << "ll_fsyncdir" << std::endl;
12961 tout(cct) << (unsigned long)dirp << std::endl;
12962
12963 if (unmounting)
12964 return -ENOTCONN;
12965
12966 return _fsync(dirp->inode.get(), false);
12967 }
12968
12969 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12970 {
12971 ceph_assert(!(flags & O_CREAT));
12972
12973 std::lock_guard lock(client_lock);
12974
12975 if (unmounting)
12976 return -ENOTCONN;
12977
12978 vinodeno_t vino = _get_vino(in);
12979
12980 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
12981 tout(cct) << "ll_open" << std::endl;
12982 tout(cct) << vino.ino.val << std::endl;
12983 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12984
12985 int r;
12986 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12987 "fuse_default_permissions");
12988 if (!fuse_default_permissions) {
12989 r = may_open(in, flags, perms);
12990 if (r < 0)
12991 goto out;
12992 }
12993
12994 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
12995
12996 out:
12997 Fh *fhptr = fhp ? *fhp : NULL;
12998 if (fhptr) {
12999 ll_unclosed_fh_set.insert(fhptr);
13000 }
13001 tout(cct) << (unsigned long)fhptr << std::endl;
13002 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
13003 " = " << r << " (" << fhptr << ")" << dendl;
13004 return r;
13005 }
13006
13007 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
13008 int flags, InodeRef *in, int caps, Fh **fhp,
13009 const UserPerm& perms)
13010 {
13011 *fhp = NULL;
13012
13013 vinodeno_t vparent = _get_vino(parent);
13014
13015 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13016 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
13017 << ", gid " << perms.gid() << dendl;
13018 tout(cct) << "ll_create" << std::endl;
13019 tout(cct) << vparent.ino.val << std::endl;
13020 tout(cct) << name << std::endl;
13021 tout(cct) << mode << std::endl;
13022 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13023
13024 bool created = false;
13025 int r = _lookup(parent, name, caps, in, perms);
13026
13027 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
13028 return -EEXIST;
13029
13030 if (r == -ENOENT && (flags & O_CREAT)) {
13031 auto fuse_default_permissions = cct->_conf.get_val<bool>(
13032 "fuse_default_permissions");
13033 if (!fuse_default_permissions) {
13034 r = may_create(parent, perms);
13035 if (r < 0)
13036 goto out;
13037 }
13038 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
13039 perms);
13040 if (r < 0)
13041 goto out;
13042 }
13043
13044 if (r < 0)
13045 goto out;
13046
13047 ceph_assert(*in);
13048
13049 ldout(cct, 20) << "_ll_create created = " << created << dendl;
13050 if (!created) {
13051 auto fuse_default_permissions = cct->_conf.get_val<bool>(
13052 "fuse_default_permissions");
13053 if (!fuse_default_permissions) {
13054 r = may_open(in->get(), flags, perms);
13055 if (r < 0) {
13056 if (*fhp) {
13057 int release_r = _release_fh(*fhp);
13058 ceph_assert(release_r == 0); // during create, no async data ops should have happened
13059 }
13060 goto out;
13061 }
13062 }
13063 if (*fhp == NULL) {
13064 r = _open(in->get(), flags, mode, fhp, perms);
13065 if (r < 0)
13066 goto out;
13067 }
13068 }
13069
13070 out:
13071 if (*fhp) {
13072 ll_unclosed_fh_set.insert(*fhp);
13073 }
13074
13075 ino_t ino = 0;
13076 if (r >= 0) {
13077 Inode *inode = in->get();
13078 if (use_faked_inos())
13079 ino = inode->faked_ino;
13080 else
13081 ino = inode->ino;
13082 }
13083
13084 tout(cct) << (unsigned long)*fhp << std::endl;
13085 tout(cct) << ino << std::endl;
13086 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13087 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
13088 *fhp << " " << hex << ino << dec << ")" << dendl;
13089
13090 return r;
13091 }
13092
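// _ll_create() order of operations: lookup first, so O_CREAT|O_EXCL can fail
// fast with -EEXIST; create only on -ENOENT when O_CREAT is set; and for a
// pre-existing file re-check may_open() before opening, since _create()
// already opened the new inode under the create permission check. Any Fh
// handed back is tracked in ll_unclosed_fh_set until it is released.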
13093 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13094 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13095 const UserPerm& perms)
13096 {
13097 std::lock_guard lock(client_lock);
13098 InodeRef in;
13099
13100 if (unmounting)
13101 return -ENOTCONN;
13102
13103 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13104 fhp, perms);
13105 if (r >= 0) {
13106 ceph_assert(in);
13107
13108 // passing an Inode in outp requires an additional ref
13109 if (outp) {
13110 _ll_get(in.get());
13111 *outp = in.get();
13112 }
13113 fill_stat(in, attr);
13114 } else {
13115 attr->st_ino = 0;
13116 }
13117
13118 return r;
13119 }
13120
13121 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13122 int oflags, Inode **outp, Fh **fhp,
13123 struct ceph_statx *stx, unsigned want, unsigned lflags,
13124 const UserPerm& perms)
13125 {
13126 unsigned caps = statx_to_mask(lflags, want);
13127 std::lock_guard lock(client_lock);
13128 InodeRef in;
13129
13130 if (unmounting)
13131 return -ENOTCONN;
13132
13133 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13134 if (r >= 0) {
13135 ceph_assert(in);
13136
13137 // passing an Inode in outp requires an additional ref
13138 if (outp) {
13139 _ll_get(in.get());
13140 *outp = in.get();
13141 }
13142 fill_statx(in, caps, stx);
13143 } else {
13144 stx->stx_ino = 0;
13145 stx->stx_mask = 0;
13146 }
13147
13148 return r;
13149 }
13150
13151 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13152 {
13153 std::lock_guard lock(client_lock);
13154 tout(cct) << "ll_lseek" << std::endl;
13155 tout(cct) << offset << std::endl;
13156 tout(cct) << whence << std::endl;
13157
13158 if (unmounting)
13159 return -ENOTCONN;
13160
13161 return _lseek(fh, offset, whence);
13162 }
13163
13164 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13165 {
13166 std::lock_guard lock(client_lock);
13167 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
13168 tout(cct) << "ll_read" << std::endl;
13169 tout(cct) << (unsigned long)fh << std::endl;
13170 tout(cct) << off << std::endl;
13171 tout(cct) << len << std::endl;
13172
13173 if (unmounting)
13174 return -ENOTCONN;
13175
13176 /* We can't return more than INT_MAX bytes read, so clamp len to that */
13177 len = std::min(len, (loff_t)INT_MAX);
13178 return _read(fh, off, len, bl);
13179 }
13180
13181 int Client::ll_read_block(Inode *in, uint64_t blockid,
13182 char *buf,
13183 uint64_t offset,
13184 uint64_t length,
13185 file_layout_t* layout)
13186 {
13187 std::lock_guard lock(client_lock);
13188
13189 if (unmounting)
13190 return -ENOTCONN;
13191
13192 vinodeno_t vino = _get_vino(in);
13193 object_t oid = file_object_t(vino.ino, blockid);
13194 C_SaferCond onfinish;
13195 bufferlist bl;
13196
13197 objecter->read(oid,
13198 object_locator_t(layout->pool_id),
13199 offset,
13200 length,
13201 vino.snapid,
13202 &bl,
13203 CEPH_OSD_FLAG_READ,
13204 &onfinish);
13205
13206 client_lock.Unlock();
13207 int r = onfinish.wait();
13208 client_lock.Lock();
13209
13210 if (r >= 0) {
13211 bl.copy(0, bl.length(), buf);
13212 r = bl.length();
13213 }
13214
13215 return r;
13216 }
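/*
 * Note on addressing: file data is striped across RADOS objects whose
 * names are derived from the inode number and the block index, so
 * file_object_t(vino.ino, blockid) above selects one backing object
 * (e.g. block 1 of inode 0x10000000000 becomes an object named
 * "10000000000.00000001"). This path goes to the OSDs directly,
 * bypassing the object cacher, which is why client_lock is dropped
 * around the synchronous wait.
 */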
13217
13218 /* The OSD appears to return success only if the entire buffer was
13219    written, so return the full write length on success. */
13220
13221 int Client::ll_write_block(Inode *in, uint64_t blockid,
13222 char* buf, uint64_t offset,
13223 uint64_t length, file_layout_t* layout,
13224 uint64_t snapseq, uint32_t sync)
13225 {
13226 vinodeno_t vino = ll_get_vino(in);
13227 int r = 0;
13228 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13229
13230 if (length == 0) {
13231 return -EINVAL;
13232 }
13233   /* The write is performed stable: the epilogue below always waits
13234    * on the commit callback, so the 'sync' argument is currently
13235    * ignored and the waiter is allocated unconditionally. */
13236   onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
13238 object_t oid = file_object_t(vino.ino, blockid);
13239 SnapContext fakesnap;
13240 ceph::bufferlist bl;
13241 if (length > 0) {
13242 bl.push_back(buffer::copy(buf, length));
13243 }
13244
13245   ldout(cct, 1) << "ll_write_block for " << vino.ino << "." << blockid
13246 << dendl;
13247
13248 fakesnap.seq = snapseq;
13249
13250 /* lock just in time */
13251 client_lock.Lock();
13252 if (unmounting) {
13253 client_lock.Unlock();
13254 return -ENOTCONN;
13255 }
13256
13257 objecter->write(oid,
13258 object_locator_t(layout->pool_id),
13259 offset,
13260 length,
13261 fakesnap,
13262 bl,
13263 ceph::real_clock::now(),
13264 0,
13265 onsafe.get());
13266
13267 client_lock.Unlock();
13268 if (nullptr != onsafe) {
13269 r = onsafe->wait();
13270 }
13271
13272 if (r < 0) {
13273 return r;
13274 } else {
13275 return length;
13276 }
13277 }
13278
13279 int Client::ll_commit_blocks(Inode *in,
13280 uint64_t offset,
13281 uint64_t length)
13282 {
13283 std::lock_guard lock(client_lock);
13284 /*
13285 BarrierContext *bctx;
13286 vinodeno_t vino = _get_vino(in);
13287 uint64_t ino = vino.ino;
13288
13289 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13290 << offset << " to " << length << dendl;
13291
13292 if (length == 0) {
13293 return -EINVAL;
13294 }
13295
13296 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13297 if (p != barriers.end()) {
13298 barrier_interval civ(offset, offset + length);
13299 p->second->commit_barrier(civ);
13300 }
13301 */
13302 return 0;
13303 }
13304
13305 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13306 {
13307 std::lock_guard lock(client_lock);
13308 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13309 "~" << len << dendl;
13310 tout(cct) << "ll_write" << std::endl;
13311 tout(cct) << (unsigned long)fh << std::endl;
13312 tout(cct) << off << std::endl;
13313 tout(cct) << len << std::endl;
13314
13315 if (unmounting)
13316 return -ENOTCONN;
13317
13318 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13319 len = std::min(len, (loff_t)INT_MAX);
13320 int r = _write(fh, off, len, data, NULL, 0);
13321 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13322 << dendl;
13323 return r;
13324 }
13325
13326 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13327 {
13328 std::lock_guard lock(client_lock);
13329 if (unmounting)
13330 return -ENOTCONN;
13331 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13332 }
13333
13334 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13335 {
13336 std::lock_guard lock(client_lock);
13337 if (unmounting)
13338 return -ENOTCONN;
13339 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13340 }
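/*
 * Illustrative scatter/gather usage of the two vectored calls above
 * (buffer setup is the caller's; the names here are hypothetical):
 *
 *   char hdr[16], payload[4096];
 *   struct iovec iov[2] = {
 *     { hdr,     sizeof(hdr)     },
 *     { payload, sizeof(payload) },
 *   };
 *   int64_t wrote = client->ll_writev(fh, iov, 2, off);
 *   int64_t got   = client->ll_readv(fh, iov, 2, off);
 */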
13341
13342 int Client::ll_flush(Fh *fh)
13343 {
13344 std::lock_guard lock(client_lock);
13345 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13346 tout(cct) << "ll_flush" << std::endl;
13347 tout(cct) << (unsigned long)fh << std::endl;
13348
13349 if (unmounting)
13350 return -ENOTCONN;
13351
13352 return _flush(fh);
13353 }
13354
13355 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13356 {
13357 std::lock_guard lock(client_lock);
13358 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13359 tout(cct) << "ll_fsync" << std::endl;
13360 tout(cct) << (unsigned long)fh << std::endl;
13361
13362 if (unmounting)
13363 return -ENOTCONN;
13364
13365 int r = _fsync(fh, syncdataonly);
13366 if (r) {
13367 // If we're returning an error, clear it from the FH
13368 fh->take_async_err();
13369 }
13370 return r;
13371 }
13372
13373 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13374 {
13375 std::lock_guard lock(client_lock);
13376 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13377 tout(cct) << "ll_sync_inode" << std::endl;
13378 tout(cct) << (unsigned long)in << std::endl;
13379
13380 if (unmounting)
13381 return -ENOTCONN;
13382
13383 return _fsync(in, syncdataonly);
13384 }
13385
13386 #ifdef FALLOC_FL_PUNCH_HOLE
13387
13388 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13389 {
13390 if (offset < 0 || length <= 0)
13391 return -EINVAL;
13392
13393 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
13394 return -EOPNOTSUPP;
13395
13396 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
13397 return -EOPNOTSUPP;
13398
13399 Inode *in = fh->inode.get();
13400
13401 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
13402 !(mode & FALLOC_FL_PUNCH_HOLE)) {
13403 return -ENOSPC;
13404 }
13405
13406 if (in->snapid != CEPH_NOSNAP)
13407 return -EROFS;
13408
13409 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
13410 return -EBADF;
13411
13412 uint64_t size = offset + length;
13413 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
13414 size > in->size &&
13415 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
13416 return -EDQUOT;
13417 }
13418
13419 int have;
13420 int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
13421 if (r < 0)
13422 return r;
13423
13424 std::unique_ptr<C_SaferCond> onuninline = nullptr;
13425 if (mode & FALLOC_FL_PUNCH_HOLE) {
13426 if (in->inline_version < CEPH_INLINE_NONE &&
13427 (have & CEPH_CAP_FILE_BUFFER)) {
13428 bufferlist bl;
13429 int len = in->inline_data.length();
13430 if (offset < len) {
13431 if (offset > 0)
13432 in->inline_data.copy(0, offset, bl);
13433 int size = length;
13434 if (offset + size > len)
13435 size = len - offset;
13436 if (size > 0)
13437 bl.append_zero(size);
13438 if (offset + size < len)
13439 in->inline_data.copy(offset + size, len - offset - size, bl);
13440 in->inline_data = bl;
13441 in->inline_version++;
13442 }
13443 in->mtime = in->ctime = ceph_clock_now();
13444 in->change_attr++;
13445 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13446 } else {
13447 if (in->inline_version < CEPH_INLINE_NONE) {
13448 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13449 uninline_data(in, onuninline.get());
13450 }
13451
13452 C_SaferCond onfinish("Client::_punch_hole flock");
13453
13454 unsafe_sync_write++;
13455 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
13456
13457 _invalidate_inode_cache(in, offset, length);
13458 filer->zero(in->ino, &in->layout,
13459 in->snaprealm->get_snap_context(),
13460 offset, length,
13461 ceph::real_clock::now(),
13462 0, true, &onfinish);
13463 in->mtime = in->ctime = ceph_clock_now();
13464 in->change_attr++;
13465 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13466
13467 client_lock.Unlock();
13468 onfinish.wait();
13469 client_lock.Lock();
13470 _sync_write_commit(in);
13471 }
13472 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
13473 uint64_t size = offset + length;
13474 if (size > in->size) {
13475 in->size = size;
13476 in->mtime = in->ctime = ceph_clock_now();
13477 in->change_attr++;
13478 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13479
13480 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
13481 check_caps(in, CHECK_CAPS_NODELAY);
13482 } else if (is_max_size_approaching(in)) {
13483 check_caps(in, 0);
13484 }
13485 }
13486 }
13487
13488 if (nullptr != onuninline) {
13489 client_lock.Unlock();
13490 int ret = onuninline->wait();
13491 client_lock.Lock();
13492
13493 if (ret >= 0 || ret == -ECANCELED) {
13494 in->inline_data.clear();
13495 in->inline_version = CEPH_INLINE_NONE;
13496 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13497 check_caps(in, 0);
13498 } else
13499 r = ret;
13500 }
13501
13502 put_cap_ref(in, CEPH_CAP_FILE_WR);
13503 return r;
13504 }
13505 #else
13506
13507 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13508 {
13509 return -EOPNOTSUPP;
13510 }
13511
13512 #endif
13513
13514
13515 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13516 {
13517 std::lock_guard lock(client_lock);
13518 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13519 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
13520 tout(cct) << (unsigned long)fh << std::endl;
13521
13522 if (unmounting)
13523 return -ENOTCONN;
13524
13525 return _fallocate(fh, mode, offset, length);
13526 }
13527
13528 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13529 {
13530 std::lock_guard lock(client_lock);
13531   tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;
13532
13533 if (unmounting)
13534 return -ENOTCONN;
13535
13536 Fh *fh = get_filehandle(fd);
13537 if (!fh)
13538 return -EBADF;
13539 #if defined(__linux__) && defined(O_PATH)
13540 if (fh->flags & O_PATH)
13541 return -EBADF;
13542 #endif
13543 return _fallocate(fh, mode, offset, length);
13544 }
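/*
 * Flag semantics accepted by _fallocate() above, mirroring the
 * supported subset of Linux fallocate(2) (a sketch, not exhaustive):
 *
 *   // extend i_size to off+len if that is beyond EOF
 *   client->fallocate(fd, 0, off, len);
 *   // zero out a range; PUNCH_HOLE requires KEEP_SIZE here
 *   client->fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *                     off, len);
 */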
13545
13546 int Client::ll_release(Fh *fh)
13547 {
13548 std::lock_guard lock(client_lock);
13549
13550 if (unmounting)
13551 return -ENOTCONN;
13552
13553 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
13554 dendl;
13555 tout(cct) << __func__ << " (fh)" << std::endl;
13556 tout(cct) << (unsigned long)fh << std::endl;
13557
13558 if (ll_unclosed_fh_set.count(fh))
13559 ll_unclosed_fh_set.erase(fh);
13560 return _release_fh(fh);
13561 }
13562
13563 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13564 {
13565 std::lock_guard lock(client_lock);
13566
13567 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13568   tout(cct) << "ll_getlk (fh)" << (unsigned long)fh << std::endl;
13569
13570 if (unmounting)
13571 return -ENOTCONN;
13572
13573 return _getlk(fh, fl, owner);
13574 }
13575
13576 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13577 {
13578 std::lock_guard lock(client_lock);
13579
13580 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13581 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13582
13583 if (unmounting)
13584 return -ENOTCONN;
13585
13586 return _setlk(fh, fl, owner, sleep);
13587 }
13588
13589 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13590 {
13591 std::lock_guard lock(client_lock);
13592
13593 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13594 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13595
13596 if (unmounting)
13597 return -ENOTCONN;
13598
13599 return _flock(fh, cmd, owner);
13600 }
13601
13602 int Client::set_deleg_timeout(uint32_t timeout)
13603 {
13604 std::lock_guard lock(client_lock);
13605
13606 /*
13607    * The whole point is to prevent blacklisting, so we must time out the
13608    * delegation before the session autoclose timeout kicks in.
13609 */
13610 if (timeout >= mdsmap->get_session_autoclose())
13611 return -EINVAL;
13612
13613 deleg_timeout = timeout;
13614 return 0;
13615 }
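/*
 * Example (hypothetical numbers): with the MDS session autoclose at
 * its common 300s default, set_deleg_timeout(299) is accepted while
 * set_deleg_timeout(300) returns -EINVAL, since a delegation that can
 * outlive the autoclose window could get this client blacklisted.
 */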
13616
13617 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13618 {
13619 int ret = -EINVAL;
13620
13621 std::lock_guard lock(client_lock);
13622
13623 if (!mounted)
13624 return -ENOTCONN;
13625
13626 Inode *inode = fh->inode.get();
13627
13628 switch(cmd) {
13629 case CEPH_DELEGATION_NONE:
13630 inode->unset_deleg(fh);
13631 ret = 0;
13632 break;
13633 default:
13634 try {
13635 ret = inode->set_deleg(fh, cmd, cb, priv);
13636 } catch (std::bad_alloc&) {
13637 ret = -ENOMEM;
13638 }
13639 break;
13640 }
13641 return ret;
13642 }
13643
13644 class C_Client_RequestInterrupt : public Context {
13645 private:
13646 Client *client;
13647 MetaRequest *req;
13648 public:
13649 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13650 req->get();
13651 }
13652 void finish(int r) override {
13653 std::lock_guard l(client->client_lock);
13654 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
13655 client->_interrupt_filelock(req);
13656 client->put_request(req);
13657 }
13658 };
13659
13660 void Client::ll_interrupt(void *d)
13661 {
13662 MetaRequest *req = static_cast<MetaRequest*>(d);
13663 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
13664 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
13665 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13666 }
13667
13668 // =========================================
13669 // layout
13670
13671 // expose file layouts
13672
13673 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13674 const UserPerm& perms)
13675 {
13676 std::lock_guard lock(client_lock);
13677
13678 if (unmounting)
13679 return -ENOTCONN;
13680
13681 filepath path(relpath);
13682 InodeRef in;
13683 int r = path_walk(path, &in, perms);
13684 if (r < 0)
13685 return r;
13686
13687 *lp = in->layout;
13688
13689 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
13690 return 0;
13691 }
13692
13693 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13694 {
13695 std::lock_guard lock(client_lock);
13696
13697 if (unmounting)
13698 return -ENOTCONN;
13699
13700 Fh *f = get_filehandle(fd);
13701 if (!f)
13702 return -EBADF;
13703 Inode *in = f->inode.get();
13704
13705 *lp = in->layout;
13706
13707 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
13708 return 0;
13709 }
13710
13711 int64_t Client::get_default_pool_id()
13712 {
13713 std::lock_guard lock(client_lock);
13714
13715 if (unmounting)
13716 return -ENOTCONN;
13717
13718 /* first data pool is the default */
13719 return mdsmap->get_first_data_pool();
13720 }
13721
13722 // expose osdmap
13723
13724 int64_t Client::get_pool_id(const char *pool_name)
13725 {
13726 std::lock_guard lock(client_lock);
13727
13728 if (unmounting)
13729 return -ENOTCONN;
13730
13731 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13732 pool_name);
13733 }
13734
13735 string Client::get_pool_name(int64_t pool)
13736 {
13737 std::lock_guard lock(client_lock);
13738
13739 if (unmounting)
13740 return string();
13741
13742 return objecter->with_osdmap([pool](const OSDMap& o) {
13743 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13744 });
13745 }
13746
13747 int Client::get_pool_replication(int64_t pool)
13748 {
13749 std::lock_guard lock(client_lock);
13750
13751 if (unmounting)
13752 return -ENOTCONN;
13753
13754 return objecter->with_osdmap([pool](const OSDMap& o) {
13755 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13756 });
13757 }
13758
13759 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13760 {
13761 std::lock_guard lock(client_lock);
13762
13763 if (unmounting)
13764 return -ENOTCONN;
13765
13766 Fh *f = get_filehandle(fd);
13767 if (!f)
13768 return -EBADF;
13769 Inode *in = f->inode.get();
13770
13771 vector<ObjectExtent> extents;
13772 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
13773 ceph_assert(extents.size() == 1);
13774
13775 objecter->with_osdmap([&](const OSDMap& o) {
13776 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13777 o.pg_to_acting_osds(pg, osds);
13778 });
13779
13780 if (osds.empty())
13781 return -EINVAL;
13782
13783 /*
13784 * Return the remainder of the extent (stripe unit)
13785 *
13786 * If length = 1 is passed to Striper::file_to_extents we get a single
13787 * extent back, but its length is one so we still need to compute the length
13788 * to the end of the stripe unit.
13789 *
13790 * If length = su then we may get 1 or 2 objects back in the extents vector
13791 * which would have to be examined. Even then, the offsets are local to the
13792 * object, so matching up to the file offset is extra work.
13793 *
13794 * It seems simpler to stick with length = 1 and manually compute the
13795 * remainder.
13796 */
13797 if (len) {
13798 uint64_t su = in->layout.stripe_unit;
13799 *len = su - (off % su);
13800 }
13801
13802 return 0;
13803 }
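/*
 * Worked example of the remainder computation above: with a stripe
 * unit of 4 MiB (su = 4194304) and off = 5 MiB, off % su = 1 MiB, so
 * *len = 3 MiB -- the caller may read that much past 'off' from the
 * same OSD set before crossing into the next stripe unit.
 */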
13804
13805 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13806 {
13807 std::lock_guard lock(client_lock);
13808
13809 if (unmounting)
13810 return -ENOTCONN;
13811
13812 if (id < 0)
13813 return -EINVAL;
13814 return objecter->with_osdmap([&](const OSDMap& o) {
13815 return o.crush->get_full_location_ordered(id, path);
13816 });
13817 }
13818
13819 int Client::get_file_stripe_address(int fd, loff_t offset,
13820 vector<entity_addr_t>& address)
13821 {
13822 std::lock_guard lock(client_lock);
13823
13824 if (unmounting)
13825 return -ENOTCONN;
13826
13827 Fh *f = get_filehandle(fd);
13828 if (!f)
13829 return -EBADF;
13830 Inode *in = f->inode.get();
13831
13832 // which object?
13833 vector<ObjectExtent> extents;
13834 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13835 in->truncate_size, extents);
13836 ceph_assert(extents.size() == 1);
13837
13838 // now we have the object and its 'layout'
13839 return objecter->with_osdmap([&](const OSDMap& o) {
13840 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13841 vector<int> osds;
13842 o.pg_to_acting_osds(pg, osds);
13843 if (osds.empty())
13844 return -EINVAL;
13845 for (unsigned i = 0; i < osds.size(); i++) {
13846 entity_addr_t addr = o.get_addrs(osds[i]).front();
13847 address.push_back(addr);
13848 }
13849 return 0;
13850 });
13851 }
13852
13853 int Client::get_osd_addr(int osd, entity_addr_t& addr)
13854 {
13855 std::lock_guard lock(client_lock);
13856
13857 if (unmounting)
13858 return -ENOTCONN;
13859
13860 return objecter->with_osdmap([&](const OSDMap& o) {
13861 if (!o.exists(osd))
13862 return -ENOENT;
13863
13864 addr = o.get_addrs(osd).front();
13865 return 0;
13866 });
13867 }
13868
13869 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13870 loff_t length, loff_t offset)
13871 {
13872 std::lock_guard lock(client_lock);
13873
13874 if (unmounting)
13875 return -ENOTCONN;
13876
13877 Fh *f = get_filehandle(fd);
13878 if (!f)
13879 return -EBADF;
13880 Inode *in = f->inode.get();
13881
13882 // map to a list of extents
13883 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13884
13885 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13886 return 0;
13887 }
13888
13889
13890 /* find an osd with the same ip. -ENXIO if none. */
13891 int Client::get_local_osd()
13892 {
13893 std::lock_guard lock(client_lock);
13894
13895 if (unmounting)
13896 return -ENOTCONN;
13897
13898 objecter->with_osdmap([this](const OSDMap& o) {
13899 if (o.get_epoch() != local_osd_epoch) {
13900 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
13901 local_osd_epoch = o.get_epoch();
13902 }
13903 });
13904 return local_osd;
13905 }
13906
13907
13908
13909
13910
13911
13912 // ===============================
13913
13914 void Client::ms_handle_connect(Connection *con)
13915 {
13916 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
13917 }
13918
13919 bool Client::ms_handle_reset(Connection *con)
13920 {
13921 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
13922 return false;
13923 }
13924
13925 void Client::ms_handle_remote_reset(Connection *con)
13926 {
13927 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
13928 std::lock_guard l(client_lock);
13929 switch (con->get_peer_type()) {
13930 case CEPH_ENTITY_TYPE_MDS:
13931 {
13932 // kludge to figure out which mds this is; fixme with a Connection* state
13933 mds_rank_t mds = MDS_RANK_NONE;
13934 MetaSession *s = NULL;
13935 for (auto &p : mds_sessions) {
13936 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
13937 mds = p.first;
13938 s = &p.second;
13939 }
13940 }
13941 if (mds >= 0) {
13942       ceph_assert(s != NULL);
13943 switch (s->state) {
13944 case MetaSession::STATE_CLOSING:
13945 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
13946 _closed_mds_session(s);
13947 break;
13948
13949 case MetaSession::STATE_OPENING:
13950 {
13951 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
13952 list<Context*> waiters;
13953 waiters.swap(s->waiting_for_open);
13954 _closed_mds_session(s);
13955 MetaSession *news = _get_or_open_mds_session(mds);
13956 news->waiting_for_open.swap(waiters);
13957 }
13958 break;
13959
13960 case MetaSession::STATE_OPEN:
13961 {
13962 objecter->maybe_request_map(); /* to check if we are blacklisted */
13963 const auto& conf = cct->_conf;
13964 if (conf->client_reconnect_stale) {
13965 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
13966 _closed_mds_session(s);
13967 } else {
13968 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
13969 s->state = MetaSession::STATE_STALE;
13970 }
13971 }
13972 break;
13973
13974 case MetaSession::STATE_NEW:
13975 case MetaSession::STATE_CLOSED:
13976 default:
13977 break;
13978 }
13979 }
13980 }
13981 break;
13982 }
13983 }
13984
13985 bool Client::ms_handle_refused(Connection *con)
13986 {
13987 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
13988 return false;
13989 }
13990
13991 bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
13992 {
13993 if (dest_type == CEPH_ENTITY_TYPE_MON)
13994 return true;
13995 *authorizer = monclient->build_authorizer(dest_type);
13996 return true;
13997 }
13998
13999 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14000 {
14001 Inode *quota_in = root_ancestor;
14002 SnapRealm *realm = in->snaprealm;
14003 while (realm) {
14004 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14005 if (realm->ino != in->ino) {
14006 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14007 if (p == inode_map.end())
14008 break;
14009
14010 if (p->second->quota.is_enable()) {
14011 quota_in = p->second;
14012 break;
14013 }
14014 }
14015 realm = realm->pparent;
14016 }
14017 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14018 return quota_in;
14019 }
14020
14021 /**
14022 * Traverse quota ancestors of the Inode, return true
14023 * if any of them passes the passed function
14024 */
14025 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14026 std::function<bool (const Inode &in)> test)
14027 {
14028 while (true) {
14029 ceph_assert(in != NULL);
14030 if (test(*in)) {
14031 return true;
14032 }
14033
14034 if (in == root_ancestor) {
14035 // We're done traversing, drop out
14036 return false;
14037 } else {
14038 // Continue up the tree
14039 in = get_quota_root(in, perms);
14040 }
14041 }
14042
14043 return false;
14044 }
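/*
 * Illustrative call (mirrors the helpers below): walk the quota
 * ancestors and ask whether any of them caps the file count at all.
 *
 *   bool has_file_quota =
 *     check_quota_condition(in, perms, [](const Inode &in) {
 *       return in.quota.max_files != 0;
 *     });
 */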
14045
14046 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14047 {
14048 return check_quota_condition(in, perms,
14049 [](const Inode &in) {
14050 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14051 });
14052 }
14053
14054 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14055 const UserPerm& perms)
14056 {
14057 return check_quota_condition(in, perms,
14058 [&new_bytes](const Inode &in) {
14059 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14060 > in.quota.max_bytes;
14061 });
14062 }
14063
14064 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
14065 {
14066 return check_quota_condition(in, perms,
14067 [](const Inode &in) {
14068 if (in.quota.max_bytes) {
14069 if (in.rstat.rbytes >= in.quota.max_bytes) {
14070 return true;
14071 }
14072
14073 ceph_assert(in.size >= in.reported_size);
14074 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14075 const uint64_t size = in.size - in.reported_size;
14076 return (space >> 4) < size;
14077 } else {
14078 return false;
14079 }
14080 });
14081 }
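/*
 * The (space >> 4) < size test above flags "approaching" once the
 * unreported growth exceeds 1/16th of the remaining quota headroom.
 * Worked example: max_bytes = 1 GiB with rbytes = 960 MiB leaves
 * space = 64 MiB, so writing more than 4 MiB past the last reported
 * size makes callers flush caps early via check_caps().
 */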
14082
14083 enum {
14084 POOL_CHECKED = 1,
14085 POOL_CHECKING = 2,
14086 POOL_READ = 4,
14087 POOL_WRITE = 8,
14088 };
14089
14090 int Client::check_pool_perm(Inode *in, int need)
14091 {
14092 if (!cct->_conf->client_check_pool_perm)
14093 return 0;
14094
14095 int64_t pool_id = in->layout.pool_id;
14096 std::string pool_ns = in->layout.pool_ns;
14097 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
14098 int have = 0;
14099 while (true) {
14100 auto it = pool_perms.find(perm_key);
14101 if (it == pool_perms.end())
14102 break;
14103 if (it->second == POOL_CHECKING) {
14104       // avoid concurrent checks
14105 wait_on_list(waiting_for_pool_perm);
14106 } else {
14107 have = it->second;
14108 ceph_assert(have & POOL_CHECKED);
14109 break;
14110 }
14111 }
14112
14113 if (!have) {
14114 if (in->snapid != CEPH_NOSNAP) {
14115       // The pool permission check needs to write to the first object. But
14116       // for snapshots, the head of the first object may have already been
14117       // deleted. To avoid creating an orphan object, skip the check for now.
14118 return 0;
14119 }
14120
14121 pool_perms[perm_key] = POOL_CHECKING;
14122
14123 char oid_buf[32];
14124 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
14125 object_t oid = oid_buf;
14126
14127 SnapContext nullsnapc;
14128
14129 C_SaferCond rd_cond;
14130 ObjectOperation rd_op;
14131 rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);
14132
14133 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
14134 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
14135
14136 C_SaferCond wr_cond;
14137 ObjectOperation wr_op;
14138 wr_op.create(true);
14139
14140 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
14141 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
14142
14143 client_lock.Unlock();
14144 int rd_ret = rd_cond.wait();
14145 int wr_ret = wr_cond.wait();
14146 client_lock.Lock();
14147
14148 bool errored = false;
14149
14150 if (rd_ret == 0 || rd_ret == -ENOENT)
14151 have |= POOL_READ;
14152 else if (rd_ret != -EPERM) {
14153 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14154 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
14155 errored = true;
14156 }
14157
14158 if (wr_ret == 0 || wr_ret == -EEXIST)
14159 have |= POOL_WRITE;
14160 else if (wr_ret != -EPERM) {
14161 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14162 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
14163 errored = true;
14164 }
14165
14166 if (errored) {
14167 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
14168 // Raise EIO because actual error code might be misleading for
14169 // userspace filesystem user.
14170 pool_perms.erase(perm_key);
14171 signal_cond_list(waiting_for_pool_perm);
14172 return -EIO;
14173 }
14174
14175 pool_perms[perm_key] = have | POOL_CHECKED;
14176 signal_cond_list(waiting_for_pool_perm);
14177 }
14178
14179 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
14180 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14181 << " need " << ccap_string(need) << ", but no read perm" << dendl;
14182 return -EPERM;
14183 }
14184 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
14185 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14186 << " need " << ccap_string(need) << ", but no write perm" << dendl;
14187 return -EPERM;
14188 }
14189
14190 return 0;
14191 }
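/*
 * Probe semantics above, for reference: read permission is inferred
 * from a stat on the file's first object (-ENOENT still proves
 * readability), and write permission from an exclusive create
 * (-EEXIST still proves writability). Anything other than success or
 * -EPERM is indeterminate and surfaces as -EIO, with the cache entry
 * erased so the next caller re-probes.
 */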
14192
14193 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14194 {
14195 if (acl_type == POSIX_ACL) {
14196 if (in->xattrs.count(ACL_EA_ACCESS)) {
14197 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14198
14199 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14200 }
14201 }
14202 return -EAGAIN;
14203 }
14204
14205 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
14206 {
14207 if (acl_type == NO_ACL)
14208 return 0;
14209
14210 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
14211 if (r < 0)
14212 goto out;
14213
14214 if (acl_type == POSIX_ACL) {
14215 if (in->xattrs.count(ACL_EA_ACCESS)) {
14216 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14217 bufferptr acl(access_acl.c_str(), access_acl.length());
14218 r = posix_acl_access_chmod(acl, mode);
14219 if (r < 0)
14220 goto out;
14221 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
14222 } else {
14223 r = 0;
14224 }
14225 }
14226 out:
14227 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
14228 return r;
14229 }
14230
14231 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
14232 const UserPerm& perms)
14233 {
14234 if (acl_type == NO_ACL)
14235 return 0;
14236
14237 if (S_ISLNK(*mode))
14238 return 0;
14239
14240 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
14241 if (r < 0)
14242 goto out;
14243
14244 if (acl_type == POSIX_ACL) {
14245 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
14246 map<string, bufferptr> xattrs;
14247
14248 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
14249 bufferptr acl(default_acl.c_str(), default_acl.length());
14250 r = posix_acl_inherit_mode(acl, mode);
14251 if (r < 0)
14252 goto out;
14253
14254 if (r > 0) {
14255 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
14256 if (r < 0)
14257 goto out;
14258 if (r > 0)
14259 xattrs[ACL_EA_ACCESS] = acl;
14260 }
14261
14262 if (S_ISDIR(*mode))
14263 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
14264
14265 r = xattrs.size();
14266 if (r > 0)
14267 encode(xattrs, xattrs_bl);
14268 } else {
14269 if (umask_cb)
14270 *mode &= ~umask_cb(callback_handle);
14271 r = 0;
14272 }
14273 }
14274 out:
14275 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
14276 return r;
14277 }
14278
14279 void Client::set_filer_flags(int flags)
14280 {
14281 std::lock_guard l(client_lock);
14282 ceph_assert(flags == 0 ||
14283 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14284 objecter->add_global_op_flags(flags);
14285 }
14286
14287 void Client::clear_filer_flags(int flags)
14288 {
14289 std::lock_guard l(client_lock);
14290 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14291 objecter->clear_global_op_flag(flags);
14292 }
14293
14294 // called before mount
14295 void Client::set_uuid(const std::string& uuid)
14296 {
14297 std::lock_guard l(client_lock);
14298   ceph_assert(initialized);
14299   ceph_assert(!uuid.empty());
14300
14301 metadata["uuid"] = uuid;
14302 _close_sessions();
14303 }
14304
14305 // called before mount. 0 means infinite
14306 void Client::set_session_timeout(unsigned timeout)
14307 {
14308 std::lock_guard l(client_lock);
14309   ceph_assert(initialized);
14310
14311 metadata["timeout"] = stringify(timeout);
14312 }
14313
14314 // called before mount
14315 int Client::start_reclaim(const std::string& uuid, unsigned flags,
14316 const std::string& fs_name)
14317 {
14318 std::lock_guard l(client_lock);
14319 if (!initialized)
14320 return -ENOTCONN;
14321
14322 if (uuid.empty())
14323 return -EINVAL;
14324
14325 {
14326 auto it = metadata.find("uuid");
14327 if (it != metadata.end() && it->second == uuid)
14328 return -EINVAL;
14329 }
14330
14331 int r = subscribe_mdsmap(fs_name);
14332 if (r < 0) {
14333 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
14334 return r;
14335 }
14336
14337 if (metadata.empty())
14338 populate_metadata("");
14339
14340 while (mdsmap->get_epoch() == 0)
14341 wait_on_list(waiting_for_mdsmap);
14342
14343 reclaim_errno = 0;
14344 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
14345 if (!mdsmap->is_up(mds)) {
14346 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
14347 wait_on_list(waiting_for_mdsmap);
14348 continue;
14349 }
14350
14351 MetaSession *session;
14352 if (!have_open_session(mds)) {
14353 session = _get_or_open_mds_session(mds);
14354 if (session->state != MetaSession::STATE_OPENING) {
14355 // umounting?
14356 return -EINVAL;
14357 }
14358 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
14359 wait_on_context_list(session->waiting_for_open);
14360 if (rejected_by_mds.count(mds))
14361 return -EPERM;
14362 continue;
14363 }
14364
14365 session = &mds_sessions.at(mds);
14366 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
14367 return -EOPNOTSUPP;
14368
14369 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
14370 session->reclaim_state == MetaSession::RECLAIMING) {
14371 session->reclaim_state = MetaSession::RECLAIMING;
14372 auto m = MClientReclaim::create(uuid, flags);
14373 session->con->send_message2(std::move(m));
14374 wait_on_list(waiting_for_reclaim);
14375 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
14376       return reclaim_errno ? reclaim_errno : -ENOTRECOVERABLE;
14377 } else {
14378 mds++;
14379 }
14380 }
14381
14382 // didn't find target session in any mds
14383 if (reclaim_target_addrs.empty()) {
14384 if (flags & CEPH_RECLAIM_RESET)
14385 return -ENOENT;
14386 return -ENOTRECOVERABLE;
14387 }
14388
14389 if (flags & CEPH_RECLAIM_RESET)
14390 return 0;
14391
14392 // use blacklist to check if target session was killed
14393 // (config option mds_session_blacklist_on_evict needs to be true)
14394 C_SaferCond cond;
14395 if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
14396 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
14397 client_lock.Unlock();
14398 cond.wait();
14399 client_lock.Lock();
14400 }
14401
14402 bool blacklisted = objecter->with_osdmap(
14403 [this](const OSDMap &osd_map) -> bool {
14404 return osd_map.is_blacklisted(reclaim_target_addrs);
14405 });
14406 if (blacklisted)
14407 return -ENOTRECOVERABLE;
14408
14409 metadata["reclaiming_uuid"] = uuid;
14410 return 0;
14411 }
14412
14413 void Client::finish_reclaim()
14414 {
14415 auto it = metadata.find("reclaiming_uuid");
14416 if (it == metadata.end()) {
14417 for (auto &p : mds_sessions)
14418 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14419 return;
14420 }
14421
14422 for (auto &p : mds_sessions) {
14423 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14424 auto m = MClientReclaim::create("", MClientReclaim::FLAG_FINISH);
14425 p.second.con->send_message2(std::move(m));
14426 }
14427
14428 metadata["uuid"] = it->second;
14429 metadata.erase(it);
14430 }
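/*
 * Sketch of the reclaim handshake (illustrative; the uuid strings are
 * hypothetical and error handling is elided). Both calls happen
 * before mount; note that finish_reclaim() adopts the reclaimed uuid
 * as this client's own, so the identity survives another restart:
 *
 *   client->set_uuid("gw0.incarnation2");   // must differ from the target
 *   int r = client->start_reclaim("gw0.incarnation1", 0, fs_name);
 *   if (r == 0)
 *     client->finish_reclaim();
 *
 * Passing CEPH_RECLAIM_RESET in 'flags' appears to only knock out the
 * stale session(s) rather than take over their state.
 */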
14431
14432 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
14433 {
14434 mds_rank_t from = mds_rank_t(reply->get_source().num());
14435 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
14436
14437 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
14438 if (!session) {
14439 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
14440 return;
14441 }
14442
14443 if (reply->get_result() >= 0) {
14444 session->reclaim_state = MetaSession::RECLAIM_OK;
14445 if (reply->get_epoch() > reclaim_osd_epoch)
14446 reclaim_osd_epoch = reply->get_epoch();
14447 if (!reply->get_addrs().empty())
14448 reclaim_target_addrs = reply->get_addrs();
14449 } else {
14450 session->reclaim_state = MetaSession::RECLAIM_FAIL;
14451 reclaim_errno = reply->get_result();
14452 }
14453
14454 signal_cond_list(waiting_for_reclaim);
14455 }
14456
14457 /**
14458 * This is included in cap release messages, to cause
14459 * the MDS to wait until this OSD map epoch. It is necessary
14460 * in corner cases where we cancel RADOS ops, so that
14461 * nobody else tries to do IO to the same objects in
14462 * the same epoch as the cancelled ops.
14463 */
14464 void Client::set_cap_epoch_barrier(epoch_t e)
14465 {
14466 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
14467 cap_epoch_barrier = e;
14468 }
14469
14470 const char** Client::get_tracked_conf_keys() const
14471 {
14472 static const char* keys[] = {
14473 "client_cache_size",
14474 "client_cache_mid",
14475 "client_acl_type",
14476 "client_deleg_timeout",
14477 "client_deleg_break_on_open",
14478 NULL
14479 };
14480 return keys;
14481 }
14482
14483 void Client::handle_conf_change(const ConfigProxy& conf,
14484 const std::set <std::string> &changed)
14485 {
14486 std::lock_guard lock(client_lock);
14487
14488 if (changed.count("client_cache_mid")) {
14489 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14490 }
14491 if (changed.count("client_acl_type")) {
14492 acl_type = NO_ACL;
14493 if (cct->_conf->client_acl_type == "posix_acl")
14494 acl_type = POSIX_ACL;
14495 }
14496 }
14497
14498 void intrusive_ptr_add_ref(Inode *in)
14499 {
14500 in->get();
14501 }
14502
14503 void intrusive_ptr_release(Inode *in)
14504 {
14505 in->client->put_inode(in);
14506 }
14507
14508 mds_rank_t Client::_get_random_up_mds() const
14509 {
14510 ceph_assert(client_lock.is_locked_by_me());
14511
14512 std::set<mds_rank_t> up;
14513 mdsmap->get_up_mds_set(up);
14514
14515 if (up.empty())
14516 return MDS_RANK_NONE;
14517 std::set<mds_rank_t>::const_iterator p = up.begin();
14518 for (int n = rand() % up.size(); n; n--)
14519 ++p;
14520 return *p;
14521 }
14522
14523
14524 StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
14525 : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
14526 {
14527 monclient->set_messenger(m);
14528 objecter->set_client_incarnation(0);
14529 }
14530
14531 StandaloneClient::~StandaloneClient()
14532 {
14533 delete objecter;
14534 objecter = nullptr;
14535 }
14536
14537 int StandaloneClient::init()
14538 {
14539 timer.init();
14540 objectcacher->start();
14541 objecter->init();
14542
14543 client_lock.Lock();
14544 ceph_assert(!is_initialized());
14545
14546 messenger->add_dispatcher_tail(objecter);
14547 messenger->add_dispatcher_tail(this);
14548
14549 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
14550 int r = monclient->init();
14551 if (r < 0) {
14552 // need to do cleanup because we're in an intermediate init state
14553 timer.shutdown();
14554 client_lock.Unlock();
14555 objecter->shutdown();
14556 objectcacher->stop();
14557 monclient->shutdown();
14558 return r;
14559 }
14560 objecter->start();
14561
14562 client_lock.Unlock();
14563 _finish_init();
14564
14565 return 0;
14566 }
14567
14568 void StandaloneClient::shutdown()
14569 {
14570 Client::shutdown();
14571 objecter->shutdown();
14572 monclient->shutdown();
14573 }