// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <sys/file.h>
#include <sys/utsname.h>
#include <sys/uio.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#if defined(__FreeBSD__)
#define XATTR_CREATE  0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/Mutex.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Delegation.h"
#include "Dir.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_statx.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}


// -------------

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

bool Client::CommandHook::call(std::string_view command,
                               const cmdmap_t& cmdmap,
                               std::string_view format, bufferlist& out)
{
  std::unique_ptr<Formatter> f(Formatter::create(format));
  f->open_object_section("result");
  m_client->client_lock.Lock();
  if (command == "mds_requests")
    m_client->dump_mds_requests(f.get());
  else if (command == "mds_sessions")
    m_client->dump_mds_sessions(f.get());
  else if (command == "dump_cache")
    m_client->dump_cache(f.get());
  else if (command == "kick_stale_sessions")
    m_client->_kick_stale_sessions();
  else if (command == "status")
    m_client->dump_status(f.get());
  else
    ceph_abort_msg("bad command registered");
  m_client->client_lock.Unlock();
  f->close_section();
  f->flush(out);
  return true;
}


// -------------

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
}

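/*
 * Hand out the next free faked ino.  Allocation scans the free set
 * upward from the last ino handed out; when the top of the 32-bit
 * range is exhausted it wraps back to 2048, since everything below
 * that is reserved for _assign_faked_root below.
 */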
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // the range 1024~2048 is reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

/*
 * In faked mode, if you export multiple subdirectories, the inode
 * numbers of the exported subdirectories are all the same, so we
 * distinguish the mount points by reserving the fake ino range
 * 1024~2048 and combining it with the low 10 bits (0x3ff) of the
 * root inode's number.
 */
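/*
 * A worked example with a hypothetical root ino of 0x10000000100:
 * the low 10 bits are 0x100 (256), so the first export is assigned
 * faked root 1024 + 256 = 1280.  A second export starts its search
 * past the previous assignment (lower_bound(last_used_faked_root + 1)),
 * so even identical low bits cannot collide.
 */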
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff=" << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::lock_guard lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

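/*
 * Wire the client into its messenger, monitor client and objecter.
 * Note that the objecter finisher is started and blacklist events
 * are enabled here, so the Objecter can call back into us as soon
 * as construction completes.
 */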
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    client_lock("Client::client_lock"),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback, // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}


Client::~Client()
{
  ceph_assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}

void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::lock_guard l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::lock_guard l(client_lock);
  root->ll_get();
  return root;
}


// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED " : "")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_num_ref()
                << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root, did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}

int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  ceph_assert(!initialized);

  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}

void Client::_finish_init()
{
  client_lock.Lock();
  // logger
  PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
  plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
  plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
  plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
  plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
  plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
  logger.reset(plb.create_perf_counters());
  cct->get_perfcounters_collection()->add(logger.get());

  client_lock.Unlock();

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           "mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions",
                                       "mds_sessions",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       "dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       "kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       "status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }

  client_lock.Lock();
  initialized = true;
  client_lock.Unlock();
}

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop(); // outside of client_lock! this does a join.

  client_lock.Lock();
  ceph_assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}


// ===================
// metadata cache stuff

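/*
 * Trim the dentry LRU down to the configured client_cache_size.
 * Each pass expires the coldest dentry; we stop when the size stops
 * shrinking (everything left is pinned) or we are under the limit.
 */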
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break; // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false); // drop dir, drop dentry
}


void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}

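/*
 * Reconcile ctime/mtime/atime from an MDS cap message with our local
 * copy.  time_warp_seq orders utimes()-style changes: a higher MDS
 * seq means the MDS values win outright; an equal seq means we keep
 * the max of each timestamp; a lower seq is only legal if we hold
 * EXCL caps (we made the newer change locally), otherwise we warn.
 */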
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  bool warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else
      warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else
      warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}

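/*
 * Create or update an Inode from an MDS InodeStat.  Fields are only
 * taken from the MDS when the stat is strictly newer (version check)
 * or when the corresponding SHARED cap was just issued, and never
 * when we hold the matching EXCL cap dirty locally.
 */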
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true); // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  return in;
}


/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true); // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        old_diri->dir_ordered_count++;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

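/*
 * Apply a dentry lease from an MDS reply.  The lease lets us trust
 * this name->inode mapping without asking the MDS again until the
 * ttl expires; note we only ever extend the ttl, never shorten it.
 */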
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty(); // FIXME that's just one frag!
}

void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
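// Note: readdir offsets are synthetic.  dir_result_t::make_fpos()
// packs either (frag, position) or, for hash-ordered listings,
// (dentry-name hash, position) into the offset, which is how a
// listing can resume mid-directory even across fragtree changes.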
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri)
{
  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  } else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i = 0; i < numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true); // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}

/** insert_trace
 *
 * insert a trace from an MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true); // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true); // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  } else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst); // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true); // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true); // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

// -------

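/*
 * Pick the MDS rank to send a request to.  Preference order: an
 * explicit resend_mds, then the dirfrag->mds map for hashed dentry
 * lookups, then the auth cap (or any cap) on the relevant inode,
 * and finally a random active MDS.
 */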
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed. */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}


void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
       q != info.export_targets.end();
       ++q) {
    if (mds_sessions.count(*q) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
      ldout(cct, 10) << __func__ << " opening mds." << mds
                     << " export target mds." << *q << dendl;
      _open_mds_session(*q);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

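/*
 * After a create-style request, make sure we actually have the inode
 * the MDS says we created.  A traceless reply (e.g. across an MDS
 * restart/replay) carries no dentry/inode trace, so we fall back to
 * a lookup or getattr and cross-check the resulting ino.
 */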
int Client::verify_reply_trace(int r,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume it's the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}


/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
           request->resend_mds < 0 && // forward
           !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}

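/*
 * Drop a request from the pending map and recompute oldest_tid,
 * which we advertise to the MDS so it can trim completed requests
 * from its session state.  SETFILELOCK requests are skipped because
 * they can legitimately stay outstanding for a very long time.
 */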
void Client::unregister_request(MetaRequest *req)
{
  mds_requests.erase(req->tid);
  if (req->tid == oldest_tid) {
    map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
    while (true) {
      if (p == mds_requests.end()) {
        oldest_tid = 0;
        break;
      }
      if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
        oldest_tid = p->first;
        break;
      }
      ++p;
    }
  }
  put_request(req);
}

void Client::put_request(MetaRequest *request)
{
  if (request->_put()) {
    int op = -1;
    if (request->success)
      op = request->get_op();
    InodeRef other_in;
    request->take_other_inode(&other_in);
    delete request;

    if (other_in &&
        (op == CEPH_MDS_OP_RMDIR ||
         op == CEPH_MDS_OP_RENAME ||
         op == CEPH_MDS_OP_RMSNAP)) {
      _try_to_trim_inode(other_in.get(), false);
    }
  }
}

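/*
 * Embed a cap release in an outgoing request: if we can drop the
 * requested caps (they are neither dirty nor in use), strip them
 * from the cap and append a release record so the MDS does not have
 * to revoke them separately.
 */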
int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
                 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
        !(unless & cap.issued)) {
      ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(cap.issued) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
      ldout(cct, 25) << "Now have: " << ccap_string(cap.issued) << dendl;
    } else {
      released = force;
    }
    if (released) {
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel, ""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}

void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
                                   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
                 << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
                                    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
                 << dn << ")" << dendl;
}

1896
1897
1898 /*
1899 * This requires the MetaRequest *req argument to be fully set up.
1900 * It will error out horribly without one.
1901 * Additionally, if you set any *drop member, you'd better have
1902 * set the corresponding dentry!
1903 */
1904 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1905 {
1906 ldout(cct, 20) << __func__ << " enter (req: "
1907 << req << ", mds: " << mds << ")" << dendl;
1908 if (req->inode_drop && req->inode())
1909 encode_inode_release(req->inode(), req,
1910 mds, req->inode_drop,
1911 req->inode_unless);
1912
1913 if (req->old_inode_drop && req->old_inode())
1914 encode_inode_release(req->old_inode(), req,
1915 mds, req->old_inode_drop,
1916 req->old_inode_unless);
1917 if (req->other_inode_drop && req->other_inode())
1918 encode_inode_release(req->other_inode(), req,
1919 mds, req->other_inode_drop,
1920 req->other_inode_unless);
1921
1922 if (req->dentry_drop && req->dentry())
1923 encode_dentry_release(req->dentry(), req,
1924 mds, req->dentry_drop,
1925 req->dentry_unless);
1926
1927 if (req->old_dentry_drop && req->old_dentry())
1928 encode_dentry_release(req->old_dentry(), req,
1929 mds, req->old_dentry_drop,
1930 req->old_dentry_unless);
1931 ldout(cct, 25) << __func__ << " exit (req: "
1932 << req << ", mds: " << mds << ")" << dendl;
1933 }
1934
1935 bool Client::have_open_session(mds_rank_t mds)
1936 {
1937 const auto &it = mds_sessions.find(mds);
1938 return it != mds_sessions.end() &&
1939 (it->second.state == MetaSession::STATE_OPEN ||
1940 it->second.state == MetaSession::STATE_STALE);
1941 }
1942
1943 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1944 {
1945 const auto &it = mds_sessions.find(mds);
1946 if (it == mds_sessions.end() || it->second.con != con) {
1947 return NULL;
1948 } else {
1949 return &it->second;
1950 }
1951 }
1952
1953 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1954 {
1955 auto it = mds_sessions.find(mds);
1956 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
1957 }
1958
1959 /**
1960 * Populate a map of strings with client-identifying metadata,
1961 * such as the hostname. Call this once at initialization.
1962 */
1963 void Client::populate_metadata(const std::string &mount_root)
1964 {
1965 // Hostname
1966 struct utsname u;
1967 int r = uname(&u);
1968 if (r >= 0) {
1969 metadata["hostname"] = u.nodename;
1970 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1971 } else {
1972 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1973 }
1974
1975 metadata["pid"] = stringify(getpid());
1976
1977 // Ceph entity id (the '0' in "client.0")
1978 metadata["entity_id"] = cct->_conf->name.get_id();
1979
1980 // Our mount position
1981 if (!mount_root.empty()) {
1982 metadata["root"] = mount_root;
1983 }
1984
1985 // Ceph version
1986 metadata["ceph_version"] = pretty_version_to_str();
1987 metadata["ceph_sha1"] = git_version_to_str();
1988
1989 // Apply any metadata from the user's configured overrides
1990 std::vector<std::string> tokens;
1991 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1992 for (const auto &i : tokens) {
1993 auto eqpos = i.find("=");
1994 // Throw out anything that isn't of the form "<str>=<str>"
1995 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size() - 1) {
1996 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1997 continue;
1998 }
1999 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2000 }
2001 }
2002
2003 /**
2004 * Optionally add or override client metadata fields.
2005 */
2006 void Client::update_metadata(std::string const &k, std::string const &v)
2007 {
2008 std::lock_guard l(client_lock);
2009 ceph_assert(initialized);
2010
2011 auto it = metadata.find(k);
2012 if (it != metadata.end()) {
2013 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2014 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2015 }
2016
2017 metadata[k] = v;
2018 }
2019
2020 MetaSession *Client::_open_mds_session(mds_rank_t mds)
2021 {
2022 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2023 auto addrs = mdsmap->get_addrs(mds);
2024 auto em = mds_sessions.emplace(std::piecewise_construct,
2025 std::forward_as_tuple(mds),
2026 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2027 ceph_assert(em.second); /* not already present */
2028 MetaSession *session = &em.first->second;
2029
2030 // Maybe skip sending a request to open if this MDS daemon
2031 // has previously sent us a REJECT.
2032 if (rejected_by_mds.count(mds)) {
2033 if (rejected_by_mds[mds] == session->addrs) {
2034 ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
2035 "because we were rejected" << dendl;
2036 return session;
2037 } else {
2038 ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
2039 "rejected us, trying with new inst" << dendl;
2040 rejected_by_mds.erase(mds);
2041 }
2042 }
2043
2044 auto m = MClientSession::create(CEPH_SESSION_REQUEST_OPEN);
2045 m->metadata = metadata;
2046 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2047 session->con->send_message2(std::move(m));
2048 return session;
2049 }
2050
2051 void Client::_close_mds_session(MetaSession *s)
2052 {
2053 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2054 s->state = MetaSession::STATE_CLOSING;
2055 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2056 }
2057
2058 void Client::_closed_mds_session(MetaSession *s)
2059 {
2060 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2061 s->state = MetaSession::STATE_CLOSED;
2062 s->con->mark_down();
2063 signal_context_list(s->waiting_for_open);
2064 mount_cond.Signal();
2065 remove_session_caps(s);
2066 kick_requests_closed(s);
2067 mds_sessions.erase(s->mds_num);
2068 }
2069
2070 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2071 {
2072 mds_rank_t from = mds_rank_t(m->get_source().num());
2073 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2074
2075 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2076 if (!session) {
2077 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2078 return;
2079 }
2080
2081 switch (m->get_op()) {
2082 case CEPH_SESSION_OPEN:
2083 {
2084 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2085 missing_features -= m->supported_features;
2086 if (!missing_features.empty()) {
2087 lderr(cct) << "mds." << from << " lacks required features '"
2088 << missing_features << "', closing session " << dendl;
2089 rejected_by_mds[session->mds_num] = session->addrs;
2090 _close_mds_session(session);
2091 _closed_mds_session(session);
2092 break;
2093 }
2094 session->mds_features = std::move(m->supported_features);
2095
2096 renew_caps(session);
2097 session->state = MetaSession::STATE_OPEN;
2098 if (unmounting)
2099 mount_cond.Signal();
2100 else
2101 connect_mds_targets(from);
2102 signal_context_list(session->waiting_for_open);
2103 break;
2104 }
2105
2106 case CEPH_SESSION_CLOSE:
2107 _closed_mds_session(session);
2108 break;
2109
2110 case CEPH_SESSION_RENEWCAPS:
2111 if (session->cap_renew_seq == m->get_seq()) {
2112 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2113 session->cap_ttl =
2114 session->last_cap_renew_request + mdsmap->get_session_timeout();
2115 if (was_stale)
2116 wake_up_session_caps(session, false);
2117 }
2118 break;
2119
2120 case CEPH_SESSION_STALE:
2121 // invalidate session caps/leases
2122 session->cap_gen++;
2123 session->cap_ttl = ceph_clock_now();
2124 session->cap_ttl -= 1;
2125 renew_caps(session);
2126 break;
2127
2128 case CEPH_SESSION_RECALL_STATE:
2129 trim_caps(session, m->get_max_caps());
2130 break;
2131
2132 case CEPH_SESSION_FLUSHMSG:
2133 /* flush cap release */
2134 if (auto& m = session->release; m) {
2135 session->con->send_message2(std::move(m));
2136 }
2137 session->con->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2138 break;
2139
2140 case CEPH_SESSION_FORCE_RO:
2141 force_session_readonly(session);
2142 break;
2143
2144 case CEPH_SESSION_REJECT:
2145 {
2146 std::string_view error_str;
2147 auto it = m->metadata.find("error_string");
2148 if (it != m->metadata.end())
2149 error_str = it->second;
2150 else
2151 error_str = "unknown error";
2152 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2153
2154 rejected_by_mds[session->mds_num] = session->addrs;
2155 _closed_mds_session(session);
2156 }
2157 break;
2158
2159 default:
2160 ceph_abort();
2161 }
2162 }
2163
2164 bool Client::_any_stale_sessions() const
2165 {
2166 ceph_assert(client_lock.is_locked_by_me());
2167
2168 for (const auto &p : mds_sessions) {
2169 if (p.second.state == MetaSession::STATE_STALE) {
2170 return true;
2171 }
2172 }
2173
2174 return false;
2175 }
2176
2177 void Client::_kick_stale_sessions()
2178 {
2179 ldout(cct, 1) << __func__ << dendl;
2180
2181 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2182 MetaSession &s = it->second;
2183 ++it;
2184 if (s.state == MetaSession::STATE_STALE)
2185 _closed_mds_session(&s);
2186 }
2187 }
2188
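/*
 * Build the wire MClientRequest for this MetaRequest and send it on the
 * given session. With drop_cap_releases (used for resends that happen
 * before our cap reconnect has gone out), the freshly encoded cap
 * releases are discarded rather than attached to the message.
 */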
2189 void Client::send_request(MetaRequest *request, MetaSession *session,
2190 bool drop_cap_releases)
2191 {
2192 // make the request
2193 mds_rank_t mds = session->mds_num;
2194 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2195 << " for mds." << mds << dendl;
2196 auto r = build_client_request(request);
2197 if (request->dentry()) {
2198 r->set_dentry_wanted();
2199 }
2200 if (request->got_unsafe) {
2201 r->set_replayed_op();
2202 if (request->target)
2203 r->head.ino = request->target->ino;
2204 } else {
2205 encode_cap_releases(request, mds);
2206 if (drop_cap_releases) // we haven't sent the cap reconnect yet, so drop cap releases
2207 request->cap_releases.clear();
2208 else
2209 r->releases.swap(request->cap_releases);
2210 }
2211 r->set_mdsmap_epoch(mdsmap->get_epoch());
2212 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2213 objecter->with_osdmap([r](const OSDMap& o) {
2214 r->set_osdmap_epoch(o.get_epoch());
2215 });
2216 }
2217
2218 if (request->mds == -1) {
2219 request->sent_stamp = ceph_clock_now();
2220 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2221 }
2222 request->mds = mds;
2223
2224 Inode *in = request->inode();
2225 if (in) {
2226 auto it = in->caps.find(mds);
2227 if (it != in->caps.end()) {
2228 request->sent_on_mseq = it->second.mseq;
2229 }
2230 }
2231
2232 session->requests.push_back(&request->item);
2233
2234 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2235 session->con->send_message2(std::move(r));
2236 }
2237
2238 MClientRequest::ref Client::build_client_request(MetaRequest *request)
2239 {
2240 auto req = MClientRequest::create(request->get_op());
2241 req->set_tid(request->tid);
2242 req->set_stamp(request->op_stamp);
2243 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2244
2245 // if the filepaths haven't been set, set them!
2246 if (request->path.empty()) {
2247 Inode *in = request->inode();
2248 Dentry *de = request->dentry();
2249 if (in)
2250 in->make_nosnap_relative_path(request->path);
2251 else if (de) {
2252 if (de->inode)
2253 de->inode->make_nosnap_relative_path(request->path);
2254 else if (de->dir) {
2255 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2256 request->path.push_dentry(de->name);
2257 }
2258 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2259 << " No path, inode, or appropriately-endowed dentry given!"
2260 << dendl;
2261 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2262 << " No path, inode, or dentry given!"
2263 << dendl;
2264 }
2265 req->set_filepath(request->get_filepath());
2266 req->set_filepath2(request->get_filepath2());
2267 req->set_data(request->data);
2268 req->set_retry_attempt(request->retry_attempt++);
2269 req->head.num_fwd = request->num_fwd;
2270 const gid_t *_gids;
2271 int gid_count = request->perms.get_gids(&_gids);
2272 req->set_gid_list(gid_count, _gids);
2273 return req;
2274 }
2275
2276
2277
2278 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2279 {
2280 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2281 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2282 if (!session) {
2283 return;
2284 }
2285 ceph_tid_t tid = fwd->get_tid();
2286
2287 if (mds_requests.count(tid) == 0) {
2288 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2289 return;
2290 }
2291
2292 MetaRequest *request = mds_requests[tid];
2293 ceph_assert(request);
2294
2295 // reset retry counter
2296 request->retry_attempt = 0;
2297
2298 // request not forwarded, or dest mds has no session.
2299 // resend.
2300 ldout(cct, 10) << __func__ << " tid " << tid
2301 << " fwd " << fwd->get_num_fwd()
2302 << " to mds." << fwd->get_dest_mds()
2303 << ", resending"
2304 << dendl;
2305
2306 request->mds = -1;
2307 request->item.remove_myself();
2308 request->num_fwd = fwd->get_num_fwd();
2309 request->resend_mds = fwd->get_dest_mds();
2310 request->caller_cond->Signal();
2311 }
2312
2313 bool Client::is_dir_operation(MetaRequest *req)
2314 {
2315 int op = req->get_op();
2316 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2317 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2318 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2319 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2320 return true;
2321 return false;
2322 }
2323
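/*
 * Handle an MClientReply. The MDS may reply twice to one request: an
 * "unsafe" reply once the change is applied, then a "safe" reply once
 * it is committed to the journal. Only the first reply wakes the
 * caller; a request that got an unsafe reply stays on the session's
 * unsafe_requests list until the safe reply retires it.
 */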
2324 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2325 {
2326 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2327 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2328 if (!session) {
2329 return;
2330 }
2331
2332 ceph_tid_t tid = reply->get_tid();
2333 bool is_safe = reply->is_safe();
2334
2335 if (mds_requests.count(tid) == 0) {
2336 lderr(cct) << __func__ << " no pending request on tid " << tid
2337 << " safe is:" << is_safe << dendl;
2338 return;
2339 }
2340 MetaRequest *request = mds_requests.at(tid);
2341
2342 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2343 << " tid " << tid << dendl;
2344
2345 if (request->got_unsafe && !is_safe) {
2346 //duplicate response
2347 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2348 << mds_num << " safe:" << is_safe << dendl;
2349 return;
2350 }
2351
2352 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2353 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2354 << " from mds." << request->mds << dendl;
2355 request->send_to_auth = true;
2356 request->resend_mds = choose_target_mds(request);
2357 Inode *in = request->inode();
2358 std::map<mds_rank_t, Cap>::const_iterator it;
2359 if (request->resend_mds >= 0 &&
2360 request->resend_mds == request->mds &&
2361 (in == NULL ||
2362 (it = in->caps.find(request->resend_mds)) == in->caps.end() ||
2363 request->sent_on_mseq == it->second.mseq)) {
2364 ldout(cct, 20) << "have to return ESTALE" << dendl;
2365 } else {
2366 request->caller_cond->Signal();
2367 return;
2368 }
2369 }
2370
2371 ceph_assert(!request->reply);
2372 request->reply = reply;
2373 insert_trace(request, session);
2374
2375 // Handle unsafe reply
2376 if (!is_safe) {
2377 request->got_unsafe = true;
2378 session->unsafe_requests.push_back(&request->unsafe_item);
2379 if (is_dir_operation(request)) {
2380 Inode *dir = request->inode();
2381 ceph_assert(dir);
2382 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2383 }
2384 if (request->target) {
2385 InodeRef &in = request->target;
2386 in->unsafe_ops.push_back(&request->unsafe_target_item);
2387 }
2388 }
2389
2390 // Only signal the caller once (on the first reply):
2391 // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2392 if (!is_safe || !request->got_unsafe) {
2393 Cond cond;
2394 request->dispatch_cond = &cond;
2395
2396 // wake up waiter
2397 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2398 request->caller_cond->Signal();
2399
2400 // wake for kick back
2401 while (request->dispatch_cond) {
2402 ldout(cct, 20) << __func__ << " awaiting kickback on tid " << tid << " " << &cond << dendl;
2403 cond.Wait(client_lock);
2404 }
2405 }
2406
2407 if (is_safe) {
2408 // the filesystem change is committed to disk
2409 // we're done, clean up
2410 if (request->got_unsafe) {
2411 request->unsafe_item.remove_myself();
2412 request->unsafe_dir_item.remove_myself();
2413 request->unsafe_target_item.remove_myself();
2414 signal_cond_list(request->waitfor_safe);
2415 }
2416 request->item.remove_myself();
2417 unregister_request(request);
2418 }
2419 if (unmounting)
2420 mount_cond.Signal();
2421 }
2422
2423 void Client::_handle_full_flag(int64_t pool)
2424 {
2425 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2426 << "on " << pool << dendl;
2427 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2428 // to do this rather than blocking, because otherwise when we fill up we
2429 // potentially lock caps forever on files with dirty pages, and we need
2430 // to be able to release those caps to the MDS so that it can delete files
2431 // and free up space.
2432 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2433
2434 // For all inodes with layouts in this pool and a pending flush write op
2435 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2436 // from ObjectCacher so that it doesn't re-issue the write in response to
2437 // the ENOSPC error.
2438 // Fortunately since we're cancelling everything in a given pool, we don't
2439 // need to know which ops belong to which ObjectSet, we can just blow all
2440 // the un-flushed cached data away and mark any dirty inodes' async_err
2441 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2442 // affecting this pool, and all the objectsets we're purging were also
2443 // in this pool.
2444 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2445 i != inode_map.end(); ++i)
2446 {
2447 Inode *inode = i->second;
2448 if (inode->oset.dirty_or_tx
2449 && (pool == -1 || inode->layout.pool_id == pool)) {
2450 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2451 << " has dirty objects, purging and setting ENOSPC" << dendl;
2452 objectcacher->purge_set(&inode->oset);
2453 inode->set_async_err(-ENOSPC);
2454 }
2455 }
2456
2457 if (cancelled_epoch != (epoch_t)-1) {
2458 set_cap_epoch_barrier(cancelled_epoch);
2459 }
2460 }
2461
2462 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2463 {
2464 std::set<entity_addr_t> new_blacklists;
2465 objecter->consume_blacklist_events(&new_blacklists);
2466
2467 const auto myaddrs = messenger->get_myaddrs();
2468 bool new_blacklist = false;
2469 bool prenautilus = objecter->with_osdmap(
2470 [&](const OSDMap& o) {
2471 return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
2472 });
2473 if (!blacklisted) {
2474 for (auto a : myaddrs.v) {
2475 // blacklist entries are always TYPE_ANY for nautilus+
2476 a.set_type(entity_addr_t::TYPE_ANY);
2477 if (new_blacklists.count(a)) {
2478 new_blacklist = true;
2479 break;
2480 }
2481 if (prenautilus) {
2482 // ...except pre-nautilus, they were TYPE_LEGACY
2483 a.set_type(entity_addr_t::TYPE_LEGACY);
2484 if (new_blacklists.count(a)) {
2485 new_blacklist = true;
2486 break;
2487 }
2488 }
2489 }
2490 }
2491 if (new_blacklist) {
2492 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2493 return o.get_epoch();
2494 });
2495 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2496 blacklisted = true;
2497
2498 _abort_mds_sessions(-EBLACKLISTED);
2499
2500 // Since we know all our OSD ops will fail, cancel them all preemptively,
2501 // so that on an unhealthy cluster we can umount promptly even if e.g.
2502 // some PGs were inaccessible.
2503 objecter->op_cancel_writes(-EBLACKLISTED);
2504
2505 } else if (blacklisted) {
2506 // Handle case where we were blacklisted but no longer are
2507 blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2508 return o.is_blacklisted(myaddrs);});
2509 }
2510
2511 // Always subscribe to next osdmap for blacklisted client
2512 // until this client is not blacklisted.
2513 if (blacklisted) {
2514 objecter->maybe_request_map();
2515 }
2516
2517 if (objecter->osdmap_full_flag()) {
2518 _handle_full_flag(-1);
2519 } else {
2520 // Accumulate local list of full pools so that I can drop
2521 // the objecter lock before re-entering objecter in
2522 // cancel_writes
2523 std::vector<int64_t> full_pools;
2524
2525 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2526 for (const auto& kv : o.get_pools()) {
2527 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2528 full_pools.push_back(kv.first);
2529 }
2530 }
2531 });
2532
2533 for (auto p : full_pools)
2534 _handle_full_flag(p);
2535
2536 // Subscribe to subsequent maps to watch for the full flag going
2537 // away. For the global full flag, objecter does this for us, but
2538 // it pays no attention to the per-pool full flag so in this branch
2539 // we do it ourselves.
2540 if (!full_pools.empty()) {
2541 objecter->maybe_request_map();
2542 }
2543 }
2544 }
2545
2546
2547 // ------------------------
2548 // incoming messages
2549
2550
2551 bool Client::ms_dispatch2(const MessageRef &m)
2552 {
2553 std::lock_guard l(client_lock);
2554 if (!initialized) {
2555 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2556 return true;
2557 }
2558
2559 switch (m->get_type()) {
2560 // mounting and mds sessions
2561 case CEPH_MSG_MDS_MAP:
2562 handle_mds_map(MMDSMap::msgref_cast(m));
2563 break;
2564 case CEPH_MSG_FS_MAP:
2565 handle_fs_map(MFSMap::msgref_cast(m));
2566 break;
2567 case CEPH_MSG_FS_MAP_USER:
2568 handle_fs_map_user(MFSMapUser::msgref_cast(m));
2569 break;
2570 case CEPH_MSG_CLIENT_SESSION:
2571 handle_client_session(MClientSession::msgref_cast(m));
2572 break;
2573
2574 case CEPH_MSG_OSD_MAP:
2575 handle_osd_map(MOSDMap::msgref_cast(m));
2576 break;
2577
2578 // requests
2579 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2580 handle_client_request_forward(MClientRequestForward::msgref_cast(m));
2581 break;
2582 case CEPH_MSG_CLIENT_REPLY:
2583 handle_client_reply(MClientReply::msgref_cast(m));
2584 break;
2585
2586 // reclaim reply
2587 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2588 handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m));
2589 break;
2590
2591 case CEPH_MSG_CLIENT_SNAP:
2592 handle_snap(MClientSnap::msgref_cast(m));
2593 break;
2594 case CEPH_MSG_CLIENT_CAPS:
2595 handle_caps(MClientCaps::msgref_cast(m));
2596 break;
2597 case CEPH_MSG_CLIENT_LEASE:
2598 handle_lease(MClientLease::msgref_cast(m));
2599 break;
2600 case MSG_COMMAND_REPLY:
2601 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2602 handle_command_reply(MCommandReply::msgref_cast(m));
2603 } else {
2604 return false;
2605 }
2606 break;
2607 case CEPH_MSG_CLIENT_QUOTA:
2608 handle_quota(MClientQuota::msgref_cast(m));
2609 break;
2610
2611 default:
2612 return false;
2613 }
2614
2615 // unmounting?
2616 if (unmounting) {
2617 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2618 << "+" << inode_map.size() << dendl;
2619 long unsigned size = lru.lru_get_size() + inode_map.size();
2620 trim_cache();
2621 if (size > lru.lru_get_size() + inode_map.size()) {
2622 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2623 mount_cond.Signal();
2624 } else {
2625 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2626 << "+" << inode_map.size() << dendl;
2627 }
2628 }
2629
2630 return true;
2631 }
2632
2633 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2634 {
2635 fsmap.reset(new FSMap(m->get_fsmap()));
2636
2637 signal_cond_list(waiting_for_fsmap);
2638
2639 monclient->sub_got("fsmap", fsmap->get_epoch());
2640 }
2641
2642 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2643 {
2644 fsmap_user.reset(new FSMapUser);
2645 *fsmap_user = m->get_fsmap();
2646
2647 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2648 signal_cond_list(waiting_for_fsmap);
2649 }
2650
2651 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2652 {
2653 mds_gid_t old_inc, new_inc;
2654 if (m->get_epoch() <= mdsmap->get_epoch()) {
2655 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2656 << " is identical to or older than our "
2657 << mdsmap->get_epoch() << dendl;
2658 return;
2659 }
2660
2661 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2662
2663 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2664 oldmap.swap(mdsmap);
2665
2666 mdsmap->decode(m->get_encoded());
2667
2668 // Cancel any commands for missing or laggy GIDs
2669 std::list<ceph_tid_t> cancel_ops;
2670 auto &commands = command_table.get_commands();
2671 for (const auto &i : commands) {
2672 auto &op = i.second;
2673 const mds_gid_t op_mds_gid = op.mds_gid;
2674 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2675 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2676 cancel_ops.push_back(i.first);
2677 if (op.outs) {
2678 std::ostringstream ss;
2679 ss << "MDS " << op_mds_gid << " went away";
2680 *(op.outs) = ss.str();
2681 }
2682 op.con->mark_down();
2683 if (op.on_finish) {
2684 op.on_finish->complete(-ETIMEDOUT);
2685 }
2686 }
2687 }
2688
2689 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2690 i != cancel_ops.end(); ++i) {
2691 command_table.erase(*i);
2692 }
2693
2694 // reset session
2695 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2696 mds_rank_t mds = p->first;
2697 MetaSession *session = &p->second;
2698 ++p;
2699
2700 int oldstate = oldmap->get_state(mds);
2701 int newstate = mdsmap->get_state(mds);
2702 if (!mdsmap->is_up(mds)) {
2703 session->con->mark_down();
2704 } else if (mdsmap->get_addrs(mds) != session->addrs) {
2705 old_inc = oldmap->get_incarnation(mds);
2706 new_inc = mdsmap->get_incarnation(mds);
2707 if (old_inc != new_inc) {
2708 ldout(cct, 1) << "mds incarnation changed from "
2709 << old_inc << " to " << new_inc << dendl;
2710 oldstate = MDSMap::STATE_NULL;
2711 }
2712 session->con->mark_down();
2713 session->addrs = mdsmap->get_addrs(mds);
2714 // When new MDS starts to take over, notify kernel to trim unused entries
2715 // in its dcache/icache. Hopefully, the kernel will release some unused
2716 // inodes before the new MDS enters reconnect state.
2717 trim_cache_for_reconnect(session);
2718 } else if (oldstate == newstate)
2719 continue; // no change
2720
2721 session->mds_state = newstate;
2722 if (newstate == MDSMap::STATE_RECONNECT) {
2723 session->con = messenger->connect_to_mds(session->addrs);
2724 send_reconnect(session);
2725 } else if (newstate > MDSMap::STATE_RECONNECT) {
2726 if (oldstate < MDSMap::STATE_RECONNECT) {
2727 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2728 _closed_mds_session(session);
2729 continue;
2730 }
2731 if (newstate >= MDSMap::STATE_ACTIVE) {
2732 if (oldstate < MDSMap::STATE_ACTIVE) {
2733 // kick new requests
2734 kick_requests(session);
2735 kick_flushing_caps(session);
2736 signal_context_list(session->waiting_for_open);
2737 wake_up_session_caps(session, true);
2738 }
2739 connect_mds_targets(mds);
2740 }
2741 } else if (newstate == MDSMap::STATE_NULL &&
2742 mds >= mdsmap->get_max_mds()) {
2743 _closed_mds_session(session);
2744 }
2745 }
2746
2747 // kick any waiting threads
2748 signal_cond_list(waiting_for_mdsmap);
2749
2750 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2751 }
2752
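/*
 * Send an MClientReconnect describing every cap we hold from this MDS
 * so that a recovering MDS can rebuild its session state. When the MDS
 * advertises CEPHFS_FEATURE_MULTI_RECONNECT, an oversized cap set is
 * split across multiple messages.
 */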
2753 void Client::send_reconnect(MetaSession *session)
2754 {
2755 mds_rank_t mds = session->mds_num;
2756 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
2757
2758 // trim unused caps to reduce MDS's cache rejoin time
2759 trim_cache_for_reconnect(session);
2760
2761 session->readonly = false;
2762
2763 session->release.reset();
2764
2765 // reset my cap seq number
2766 session->seq = 0;
2767 // connect to the mds' offload targets
2768 connect_mds_targets(mds);
2769 // make sure unsafe requests get saved
2770 resend_unsafe_requests(session);
2771
2772 early_kick_flushing_caps(session);
2773
2774 auto m = MClientReconnect::create();
2775 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
2776
2777 // i have an open session.
2778 ceph::unordered_set<inodeno_t> did_snaprealm;
2779 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2780 p != inode_map.end();
2781 ++p) {
2782 Inode *in = p->second;
2783 auto it = in->caps.find(mds);
2784 if (it != in->caps.end()) {
2785 if (allow_multi &&
2786 m->get_approx_size() >= (std::numeric_limits<int>::max() >> 1)) {
2787 m->mark_more();
2788 session->con->send_message2(std::move(m));
2789
2790 m = MClientReconnect::create();
2791 }
2792
2793 Cap &cap = it->second;
2794 ldout(cct, 10) << " caps on " << p->first
2795 << " " << ccap_string(cap.issued)
2796 << " wants " << ccap_string(in->caps_wanted())
2797 << dendl;
2798 filepath path;
2799 in->make_long_path(path);
2800 ldout(cct, 10) << " path " << path << dendl;
2801
2802 bufferlist flockbl;
2803 _encode_filelocks(in, flockbl);
2804
2805 cap.seq = 0; // reset seq.
2806 cap.issue_seq = 0; // reset seq.
2807 cap.mseq = 0; // reset seq.
2808 // cap gen should catch up with session cap_gen
2809 if (cap.gen < session->cap_gen) {
2810 cap.gen = session->cap_gen;
2811 cap.issued = cap.implemented = CEPH_CAP_PIN;
2812 } else {
2813 cap.issued = cap.implemented;
2814 }
2815 snapid_t snap_follows = 0;
2816 if (!in->cap_snaps.empty())
2817 snap_follows = in->cap_snaps.begin()->first;
2818
2819 m->add_cap(p->first.ino,
2820 cap.cap_id,
2821 path.get_ino(), path.get_path(), // ino
2822 in->caps_wanted(), // wanted
2823 cap.issued, // issued
2824 in->snaprealm->ino,
2825 snap_follows,
2826 flockbl);
2827
2828 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2829 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2830 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2831 did_snaprealm.insert(in->snaprealm->ino);
2832 }
2833 }
2834 }
2835
2836 if (!allow_multi)
2837 m->set_encoding_version(0); // use connection features to choose encoding
2838 session->con->send_message2(std::move(m));
2839
2840 mount_cond.Signal();
2841
2842 if (session->reclaim_state == MetaSession::RECLAIMING)
2843 signal_cond_list(waiting_for_reclaim);
2844 }
2845
2846
2847 void Client::kick_requests(MetaSession *session)
2848 {
2849 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2850 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2851 p != mds_requests.end();
2852 ++p) {
2853 MetaRequest *req = p->second;
2854 if (req->got_unsafe)
2855 continue;
2856 if (req->aborted()) {
2857 if (req->caller_cond) {
2858 req->kick = true;
2859 req->caller_cond->Signal();
2860 }
2861 continue;
2862 }
2863 if (req->retry_attempt > 0)
2864 continue; // new requests only
2865 if (req->mds == session->mds_num) {
2866 send_request(p->second, session);
2867 }
2868 }
2869 }
2870
2871 void Client::resend_unsafe_requests(MetaSession *session)
2872 {
2873 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2874 !iter.end();
2875 ++iter)
2876 send_request(*iter, session);
2877
2878 // also re-send old requests when the MDS enters reconnect stage, so that the
2879 // MDS can process completed requests in clientreplay stage.
2880 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2881 p != mds_requests.end();
2882 ++p) {
2883 MetaRequest *req = p->second;
2884 if (req->got_unsafe)
2885 continue;
2886 if (req->aborted())
2887 continue;
2888 if (req->retry_attempt == 0)
2889 continue; // old requests only
2890 if (req->mds == session->mds_num)
2891 send_request(req, session, true);
2892 }
2893 }
2894
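/*
 * Block until all currently unsafe requests have been committed. It is
 * enough to wait on the last unsafe request of each session, since a
 * session's unsafe requests presumably complete in submission order.
 */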
2895 void Client::wait_unsafe_requests()
2896 {
2897 list<MetaRequest*> last_unsafe_reqs;
2898 for (const auto &p : mds_sessions) {
2899 const MetaSession &s = p.second;
2900 if (!s.unsafe_requests.empty()) {
2901 MetaRequest *req = s.unsafe_requests.back();
2902 req->get();
2903 last_unsafe_reqs.push_back(req);
2904 }
2905 }
2906
2907 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2908 p != last_unsafe_reqs.end();
2909 ++p) {
2910 MetaRequest *req = *p;
2911 if (req->unsafe_item.is_on_list())
2912 wait_on_list(req->waitfor_safe);
2913 put_request(req);
2914 }
2915 }
2916
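/*
 * The session is being torn down: kick callers blocked in make_request
 * so they can resend elsewhere, and drop any unsafe (uncommitted)
 * requests, marking the affected inodes with EIO since we can no longer
 * confirm whether those operations were applied.
 */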
2917 void Client::kick_requests_closed(MetaSession *session)
2918 {
2919 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2920 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2921 p != mds_requests.end(); ) {
2922 MetaRequest *req = p->second;
2923 ++p;
2924 if (req->mds == session->mds_num) {
2925 if (req->caller_cond) {
2926 req->kick = true;
2927 req->caller_cond->Signal();
2928 }
2929 req->item.remove_myself();
2930 if (req->got_unsafe) {
2931 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
2932 req->unsafe_item.remove_myself();
2933 if (is_dir_operation(req)) {
2934 Inode *dir = req->inode();
2935 ceph_assert(dir);
2936 dir->set_async_err(-EIO);
2937 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2938 << dir->ino << " " << req->get_tid() << dendl;
2939 req->unsafe_dir_item.remove_myself();
2940 }
2941 if (req->target) {
2942 InodeRef &in = req->target;
2943 in->set_async_err(-EIO);
2944 lderr(cct) << "kick_requests_closed drop req of inode : "
2945 << in->ino << " " << req->get_tid() << dendl;
2946 req->unsafe_target_item.remove_myself();
2947 }
2948 signal_cond_list(req->waitfor_safe);
2949 unregister_request(req);
2950 }
2951 }
2952 }
2953 ceph_assert(session->requests.empty());
2954 ceph_assert(session->unsafe_requests.empty());
2955 }
2956
2957
2958
2959
2960 /************
2961 * leases
2962 */
2963
2964 void Client::got_mds_push(MetaSession *s)
2965 {
2966 s->seq++;
2967 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2968 if (s->state == MetaSession::STATE_CLOSING) {
2969 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2970 }
2971 }
2972
2973 void Client::handle_lease(const MConstRef<MClientLease>& m)
2974 {
2975 ldout(cct, 10) << __func__ << " " << *m << dendl;
2976
2977 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
2978
2979 mds_rank_t mds = mds_rank_t(m->get_source().num());
2980 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
2981 if (!session) {
2982 return;
2983 }
2984
2985 got_mds_push(session);
2986
2987 ceph_seq_t seq = m->get_seq();
2988
2989 Inode *in;
2990 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
2991 if (inode_map.count(vino) == 0) {
2992 ldout(cct, 10) << " don't have vino " << vino << dendl;
2993 goto revoke;
2994 }
2995 in = inode_map[vino];
2996
2997 if (m->get_mask() & CEPH_LOCK_DN) {
2998 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
2999 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3000 goto revoke;
3001 }
3002 Dentry *dn = in->dir->dentries[m->dname];
3003 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3004 dn->lease_mds = -1;
3005 }
3006
3007 revoke:
3008 {
3009 auto reply = MClientLease::create(CEPH_MDS_LEASE_RELEASE, seq, m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname);
3010 m->get_connection()->send_message2(std::move(reply));
3011 }
3012 }
3013
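/*
 * Drop n references on the inode. On the last put, release all caps and
 * cached object data, unhook the inode from inode_map (and the root
 * pointers, when applicable) and delete it.
 */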
3014 void Client::put_inode(Inode *in, int n)
3015 {
3016 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3017 int left = in->_put(n);
3018 if (left == 0) {
3019 // release any caps
3020 remove_all_caps(in);
3021
3022 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3023 bool unclean = objectcacher->release_set(&in->oset);
3024 ceph_assert(!unclean);
3025 inode_map.erase(in->vino());
3026 if (use_faked_inos())
3027 _release_faked_ino(in);
3028
3029 if (in == root) {
3030 root = 0;
3031 root_ancestor = 0;
3032 while (!root_parents.empty())
3033 root_parents.erase(root_parents.begin());
3034 }
3035
3036 delete in;
3037 }
3038 }
3039
3040 void Client::close_dir(Dir *dir)
3041 {
3042 Inode *in = dir->parent_inode;
3043 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3044 ceph_assert(dir->is_empty());
3045 ceph_assert(in->dir == dir);
3046 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3047 if (!in->dentries.empty())
3048 in->get_first_parent()->put(); // unpin dentry
3049
3050 delete in->dir;
3051 in->dir = 0;
3052 put_inode(in); // unpin inode
3053 }
3054
3055 /**
3056 * Don't call this with in==NULL; use get_or_create for that.
3057 * Leave dn set to the default NULL unless you're trying to add
3058 * a new inode to a pre-created Dentry.
3059 */
3060 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3061 {
3062 if (!dn) {
3063 // create a new Dentry
3064 dn = new Dentry(dir, name);
3065
3066 lru.lru_insert_mid(dn); // mid or top?
3067
3068 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3069 << " dn " << dn << " (new dn)" << dendl;
3070 } else {
3071 ceph_assert(!dn->inode);
3072 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3073 << " dn " << dn << " (old dn)" << dendl;
3074 }
3075
3076 if (in) { // link to inode
3077 InodeRef tmp_ref;
3078 // only one parent for directories!
3079 if (in->is_dir() && !in->dentries.empty()) {
3080 tmp_ref = in; // prevent unlink below from freeing the inode.
3081 Dentry *olddn = in->get_first_parent();
3082 ceph_assert(olddn->dir != dir || olddn->name != name);
3083 Inode *old_diri = olddn->dir->parent_inode;
3084 old_diri->dir_release_count++;
3085 clear_dir_complete_and_ordered(old_diri, true);
3086 unlink(olddn, true, true); // keep dir, dentry
3087 }
3088
3089 dn->link(in);
3090 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3091 }
3092
3093 return dn;
3094 }
3095
3096 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3097 {
3098 InodeRef in(dn->inode);
3099 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3100 << " inode " << dn->inode << dendl;
3101
3102 // unlink from inode
3103 if (dn->inode) {
3104 dn->unlink();
3105 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
3106 }
3107
3108 if (keepdentry) {
3109 dn->lease_mds = -1;
3110 } else {
3111 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3112
3113 // unlink from dir
3114 Dir *dir = dn->dir;
3115 dn->detach();
3116
3117 // delete den
3118 lru.lru_remove(dn);
3119 dn->put();
3120
3121 if (dir->is_empty() && !keepdir)
3122 close_dir(dir);
3123 }
3124 }
3125
3126 /**
3127 * For asynchronous flushes, check for errors from the IO and
3128 * update the inode if necessary
3129 */
3130 class C_Client_FlushComplete : public Context {
3131 private:
3132 Client *client;
3133 InodeRef inode;
3134 public:
3135 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3136 void finish(int r) override {
3137 ceph_assert(client->client_lock.is_locked_by_me());
3138 if (r != 0) {
3139 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3140 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3141 << " 0x" << std::hex << inode->ino << std::dec
3142 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3143 inode->set_async_err(r);
3144 }
3145 }
3146 };
3147
3148
3149 /****
3150 * caps
3151 */
3152
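/*
 * Take references on the given caps. The first FILE_BUFFER or
 * FILE_CACHE ref additionally pins the inode itself, so it cannot go
 * away while buffered or cached data remains; put_cap_ref() drops that
 * pin again when the last such ref is released.
 */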
3153 void Client::get_cap_ref(Inode *in, int cap)
3154 {
3155 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3156 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3157 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3158 in->get();
3159 }
3160 if ((cap & CEPH_CAP_FILE_CACHE) &&
3161 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3162 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3163 in->get();
3164 }
3165 in->get_cap_ref(cap);
3166 }
3167
3168 void Client::put_cap_ref(Inode *in, int cap)
3169 {
3170 int last = in->put_cap_ref(cap);
3171 if (last) {
3172 int put_nref = 0;
3173 int drop = last & ~in->caps_issued();
3174 if (in->snapid == CEPH_NOSNAP) {
3175 if ((last & CEPH_CAP_FILE_WR) &&
3176 !in->cap_snaps.empty() &&
3177 in->cap_snaps.rbegin()->second.writing) {
3178 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3179 in->cap_snaps.rbegin()->second.writing = 0;
3180 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3181 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3182 }
3183 if (last & CEPH_CAP_FILE_BUFFER) {
3184 for (auto &p : in->cap_snaps)
3185 p.second.dirty_data = 0;
3186 signal_cond_list(in->waitfor_commit);
3187 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3188 ++put_nref;
3189 }
3190 }
3191 if (last & CEPH_CAP_FILE_CACHE) {
3192 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
3193 ++put_nref;
3194 }
3195 if (drop)
3196 check_caps(in, 0);
3197 if (put_nref)
3198 put_inode(in, put_nref);
3199 }
3200 }
3201
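/*
 * Wait until the caps in 'need' are all issued, opportunistically
 * adding any of 'want' that is issued and not being revoked. On
 * success, the combined set is stored in *phave and a cap reference is
 * taken on 'need', which the caller must drop with put_cap_ref(). For
 * writes, endoff is the end of the intended write, used to ask the MDS
 * for a larger max_size before blocking. Typical read-side usage, as an
 * illustrative sketch only:
 *
 *   int have;
 *   int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
 *   if (r < 0)
 *     return r;
 *   // ... perform the read, consulting 'have' ...
 *   put_cap_ref(in, CEPH_CAP_FILE_RD);
 */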
3202 int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
3203 {
3204 int r = check_pool_perm(in, need);
3205 if (r < 0)
3206 return r;
3207
3208 while (1) {
3209 int file_wanted = in->caps_file_wanted();
3210 if ((file_wanted & need) != need) {
3211 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3212 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3213 << dendl;
3214 return -EBADF;
3215 }
3216
3217 int implemented;
3218 int have = in->caps_issued(&implemented);
3219
3220 bool waitfor_caps = false;
3221 bool waitfor_commit = false;
3222
3223 if (have & need & CEPH_CAP_FILE_WR) {
3224 if (endoff > 0 &&
3225 (endoff >= (loff_t)in->max_size ||
3226 endoff > (loff_t)(in->size << 1)) &&
3227 endoff > (loff_t)in->wanted_max_size) {
3228 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3229 in->wanted_max_size = endoff;
3230 check_caps(in, 0);
3231 }
3232
3233 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3234 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3235 waitfor_caps = true;
3236 }
3237 if (!in->cap_snaps.empty()) {
3238 if (in->cap_snaps.rbegin()->second.writing) {
3239 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3240 waitfor_caps = true;
3241 }
3242 for (auto &p : in->cap_snaps) {
3243 if (p.second.dirty_data) {
3244 waitfor_commit = true;
3245 break;
3246 }
3247 }
3248 if (waitfor_commit) {
3249 _flush(in, new C_Client_FlushComplete(this, in));
3250 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3251 }
3252 }
3253 }
3254
3255 if (!waitfor_caps && !waitfor_commit) {
3256 if ((have & need) == need) {
3257 int revoking = implemented & ~have;
3258 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3259 << " need " << ccap_string(need) << " want " << ccap_string(want)
3260 << " revoking " << ccap_string(revoking)
3261 << dendl;
3262 if ((revoking & want) == 0) {
3263 *phave = need | (have & want);
3264 in->get_cap_ref(need);
3265 return 0;
3266 }
3267 }
3268 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3269 waitfor_caps = true;
3270 }
3271
3272 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3273 in->auth_cap->session->readonly)
3274 return -EROFS;
3275
3276 if (in->flags & I_CAP_DROPPED) {
3277 int mds_wanted = in->caps_mds_wanted();
3278 if ((mds_wanted & need) != need) {
3279 int ret = _renew_caps(in);
3280 if (ret < 0)
3281 return ret;
3282 continue;
3283 }
3284 if (!(file_wanted & ~mds_wanted))
3285 in->flags &= ~I_CAP_DROPPED;
3286 }
3287
3288 if (waitfor_caps)
3289 wait_on_list(in->waitfor_caps);
3290 else if (waitfor_commit)
3291 wait_on_list(in->waitfor_commit);
3292 }
3293 }
3294
3295 int Client::get_caps_used(Inode *in)
3296 {
3297 unsigned used = in->caps_used();
3298 if (!(used & CEPH_CAP_FILE_CACHE) &&
3299 !objectcacher->set_is_empty(&in->oset))
3300 used |= CEPH_CAP_FILE_CACHE;
3301 return used;
3302 }
3303
3304 void Client::cap_delay_requeue(Inode *in)
3305 {
3306 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3307 in->hold_caps_until = ceph_clock_now();
3308 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3309 delayed_list.push_back(&in->delay_cap_item);
3310 }
3311
3312 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3313 int flags, int used, int want, int retain,
3314 int flush, ceph_tid_t flush_tid)
3315 {
3316 int held = cap->issued | cap->implemented;
3317 int revoking = cap->implemented & ~cap->issued;
3318 retain &= ~revoking;
3319 int dropping = cap->issued & ~retain;
3320 int op = CEPH_CAP_OP_UPDATE;
3321
3322 ldout(cct, 10) << __func__ << " " << *in
3323 << " mds." << session->mds_num << " seq " << cap->seq
3324 << " used " << ccap_string(used)
3325 << " want " << ccap_string(want)
3326 << " flush " << ccap_string(flush)
3327 << " retain " << ccap_string(retain)
3328 << " held "<< ccap_string(held)
3329 << " revoking " << ccap_string(revoking)
3330 << " dropping " << ccap_string(dropping)
3331 << dendl;
3332
3333 if (cct->_conf->client_inject_release_failure && revoking) {
3334 const int would_have_issued = cap->issued & retain;
3335 const int would_have_implemented = cap->implemented & (cap->issued | used);
3336 // Simulated bug:
3337 // - tell the server we think issued is whatever they issued plus whatever we implemented
3338 // - leave what we have implemented in place
3339 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3340 cap->issued = cap->issued | cap->implemented;
3341
3342 // Make an exception for revoking xattr caps: we are injecting
3343 // failure to release other caps, but allow xattr because client
3344 // will block on xattr ops if it can't release these to MDS (#9800)
3345 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3346 cap->issued ^= xattr_mask & revoking;
3347 cap->implemented ^= xattr_mask & revoking;
3348
3349 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3350 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3351 } else {
3352 // Normal behaviour
3353 cap->issued &= retain;
3354 cap->implemented &= cap->issued | used;
3355 }
3356
3357 snapid_t follows = 0;
3358
3359 if (flush)
3360 follows = in->snaprealm->get_snap_context().seq;
3361
3362 auto m = MClientCaps::create(op,
3363 in->ino,
3364 0,
3365 cap->cap_id, cap->seq,
3366 cap->implemented,
3367 want,
3368 flush,
3369 cap->mseq,
3370 cap_epoch_barrier);
3371 m->caller_uid = in->cap_dirtier_uid;
3372 m->caller_gid = in->cap_dirtier_gid;
3373
3374 m->head.issue_seq = cap->issue_seq;
3375 m->set_tid(flush_tid);
3376
3377 m->head.uid = in->uid;
3378 m->head.gid = in->gid;
3379 m->head.mode = in->mode;
3380
3381 m->head.nlink = in->nlink;
3382
3383 if (flush & CEPH_CAP_XATTR_EXCL) {
3384 encode(in->xattrs, m->xattrbl);
3385 m->head.xattr_version = in->xattr_version;
3386 }
3387
3388 m->size = in->size;
3389 m->max_size = in->max_size;
3390 m->truncate_seq = in->truncate_seq;
3391 m->truncate_size = in->truncate_size;
3392 m->mtime = in->mtime;
3393 m->atime = in->atime;
3394 m->ctime = in->ctime;
3395 m->btime = in->btime;
3396 m->time_warp_seq = in->time_warp_seq;
3397 m->change_attr = in->change_attr;
3398
3399 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3400 !in->cap_snaps.empty() &&
3401 in->cap_snaps.rbegin()->second.flush_tid == 0)
3402 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3403 m->flags = flags;
3404
3405 if (flush & CEPH_CAP_FILE_WR) {
3406 m->inline_version = in->inline_version;
3407 m->inline_data = in->inline_data;
3408 }
3409
3410 in->reported_size = in->size;
3411 m->set_snap_follows(follows);
3412 cap->wanted = want;
3413 if (cap == in->auth_cap) {
3414 m->set_max_size(in->wanted_max_size);
3415 in->requested_max_size = in->wanted_max_size;
3416 ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
3417 }
3418
3419 if (!session->flushing_caps_tids.empty())
3420 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3421
3422 session->con->send_message2(std::move(m));
3423 }
3424
3425 static bool is_max_size_approaching(Inode *in)
3426 {
3427 /* mds will adjust max size according to the reported size */
3428 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3429 return false;
3430 if (in->size >= in->max_size)
3431 return true;
3432 /* half of previous max_size increment has been used */
3433 if (in->max_size > in->reported_size &&
3434 (in->size << 1) >= in->max_size + in->reported_size)
3435 return true;
3436 return false;
3437 }
3438
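/*
 * With LAZYIO, buffered/cached io need not be covered by the regular
 * Fc/Fb caps. Re-account such usage as CEPH_CAP_FILE_LAZYIO; this
 * presumably keeps lazy io from blocking revocation of FILE_CACHE or
 * FILE_BUFFER in the cap-checking paths.
 */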
3439 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3440 {
3441 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3442 return used;
3443 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3444 return used;
3445
3446 if (issued & CEPH_CAP_FILE_LAZYIO) {
3447 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3448 used &= ~CEPH_CAP_FILE_CACHE;
3449 used |= CEPH_CAP_FILE_LAZYIO;
3450 }
3451 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3452 used &= ~CEPH_CAP_FILE_BUFFER;
3453 used |= CEPH_CAP_FILE_LAZYIO;
3454 }
3455 } else {
3456 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3457 used &= ~CEPH_CAP_FILE_CACHE;
3458 used |= CEPH_CAP_FILE_LAZYIO;
3459 }
3460 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3461 used &= ~CEPH_CAP_FILE_BUFFER;
3462 used |= CEPH_CAP_FILE_LAZYIO;
3463 }
3464 }
3465 return used;
3466 }
3467
3468 /**
3469 * check_caps
3470 *
3471 * Examine currently used and wanted versus held caps. Release, flush or ack
3472 * revoked caps to the MDS as appropriate.
3473 *
3474 * @param in the inode to check
3475 * @param flags flags to apply to cap check
3476 */
3477 void Client::check_caps(Inode *in, unsigned flags)
3478 {
3479 unsigned wanted = in->caps_wanted();
3480 unsigned used = get_caps_used(in);
3481 unsigned cap_used;
3482
3483 int implemented;
3484 int issued = in->caps_issued(&implemented);
3485 int revoking = implemented & ~issued;
3486
3487 int orig_used = used;
3488 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3489
3490 int retain = wanted | used | CEPH_CAP_PIN;
3491 if (!unmounting && in->nlink > 0) {
3492 if (wanted) {
3493 retain |= CEPH_CAP_ANY;
3494 } else if (in->is_dir() &&
3495 (issued & CEPH_CAP_FILE_SHARED) &&
3496 (in->flags & I_COMPLETE)) {
3497 // we do this here because we don't want to drop to Fs (and then
3498 // drop the Fs if we do a create!) if that alone makes us send lookups
3499 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3500 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3501 retain |= wanted;
3502 } else {
3503 retain |= CEPH_CAP_ANY_SHARED;
3504 // keep RD only if we didn't have the file open RW,
3505 // because then the mds would revoke it anyway to
3506 // journal max_size=0.
3507 if (in->max_size == 0)
3508 retain |= CEPH_CAP_ANY_RD;
3509 }
3510 }
3511
3512 ldout(cct, 10) << __func__ << " on " << *in
3513 << " wanted " << ccap_string(wanted)
3514 << " used " << ccap_string(used)
3515 << " issued " << ccap_string(issued)
3516 << " revoking " << ccap_string(revoking)
3517 << " flags=" << flags
3518 << dendl;
3519
3520 if (in->snapid != CEPH_NOSNAP)
3521 return; //snap caps last forever, can't write
3522
3523 if (in->caps.empty())
3524 return; // guard if at end of func
3525
3526 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3527 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3528 if (_release(in))
3529 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3530 }
3531
3532
3533 for (auto &p : in->caps) {
3534 mds_rank_t mds = p.first;
3535 Cap &cap = p.second;
3536
3537 MetaSession *session = &mds_sessions.at(mds);
3538
3539 cap_used = used;
3540 if (in->auth_cap && &cap != in->auth_cap)
3541 cap_used &= ~in->auth_cap->issued;
3542
3543 revoking = cap.implemented & ~cap.issued;
3544
3545 ldout(cct, 10) << " cap mds." << mds
3546 << " issued " << ccap_string(cap.issued)
3547 << " implemented " << ccap_string(cap.implemented)
3548 << " revoking " << ccap_string(revoking) << dendl;
3549
3550 if (in->wanted_max_size > in->max_size &&
3551 in->wanted_max_size > in->requested_max_size &&
3552 &cap == in->auth_cap)
3553 goto ack;
3554
3555 /* approaching file_max? */
3556 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3557 &cap == in->auth_cap &&
3558 is_max_size_approaching(in)) {
3559 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3560 << ", reported " << in->reported_size << dendl;
3561 goto ack;
3562 }
3563
3564 /* completed revocation? */
3565 if (revoking && (revoking & cap_used) == 0) {
3566 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3567 goto ack;
3568 }
3569
3570 /* want more caps from mds? */
3571 if (wanted & ~(cap.wanted | cap.issued))
3572 goto ack;
3573
3574 if (!revoking && unmounting && (cap_used == 0))
3575 goto ack;
3576
3577 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
3578 !in->dirty_caps) // and we have no dirty caps
3579 continue;
3580
3581 if (!(flags & CHECK_CAPS_NODELAY)) {
3582 ldout(cct, 10) << "delaying cap release" << dendl;
3583 cap_delay_requeue(in);
3584 continue;
3585 }
3586
3587 ack:
3588 if (&cap == in->auth_cap) {
3589 if (in->flags & I_KICK_FLUSH) {
3590 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3591 << " to mds." << mds << dendl;
3592 kick_flushing_caps(in, session);
3593 }
3594 if (!in->cap_snaps.empty() &&
3595 in->cap_snaps.rbegin()->second.flush_tid == 0)
3596 flush_snaps(in);
3597 }
3598
3599 int flushing;
3600 ceph_tid_t flush_tid;
3601 if (in->auth_cap == &cap && in->dirty_caps) {
3602 flushing = mark_caps_flushing(in, &flush_tid);
3603 } else {
3604 flushing = 0;
3605 flush_tid = 0;
3606 }
3607
3608 int msg_flags = (flags & CHECK_CAPS_SYNCHRONOUS) ? MClientCaps::FLAG_SYNC : 0;
3609 send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
3610 flushing, flush_tid);
3611 }
3612 }
3613
3614
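/*
 * Called when the snap context changes. If the inode is dirty (caps or
 * data) under the old context, capture its current state in a CapSnap
 * keyed by the old snapc's seq, to be flushed to the MDS against that
 * snapshot. If a write is in flight, the capsnap is left in "writing"
 * state and finished from put_cap_ref() when the last FILE_WR ref
 * drops.
 */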
3615 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3616 {
3617 int used = get_caps_used(in);
3618 int dirty = in->caps_dirty();
3619 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3620
3621 if (in->cap_snaps.size() &&
3622 in->cap_snaps.rbegin()->second.writing) {
3623 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3624 return;
3625 } else if (in->caps_dirty() ||
3626 (used & CEPH_CAP_FILE_WR) ||
3627 (dirty & CEPH_CAP_ANY_WR)) {
3628 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3629 ceph_assert(capsnapem.second); /* element inserted */
3630 CapSnap &capsnap = capsnapem.first->second;
3631 capsnap.context = old_snapc;
3632 capsnap.issued = in->caps_issued();
3633 capsnap.dirty = in->caps_dirty();
3634
3635 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3636
3637 capsnap.uid = in->uid;
3638 capsnap.gid = in->gid;
3639 capsnap.mode = in->mode;
3640 capsnap.btime = in->btime;
3641 capsnap.xattrs = in->xattrs;
3642 capsnap.xattr_version = in->xattr_version;
3643 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3644 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3645
3646 if (used & CEPH_CAP_FILE_WR) {
3647 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3648 capsnap.writing = 1;
3649 } else {
3650 finish_cap_snap(in, capsnap, used);
3651 }
3652 } else {
3653 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3654 }
3655 }
3656
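// finish_cap_snap: record the remaining mutable fields (size, times,
// change_attr) in the capsnap. If buffered dirty data is still outstanding
// (FILE_BUFFER in 'used'), the actual flush is deferred until writeback
// completes (see _flushed_cap_snap below); otherwise the capsnap is flushed
// right away.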
3657 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3658 {
3659 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3660 capsnap.size = in->size;
3661 capsnap.mtime = in->mtime;
3662 capsnap.atime = in->atime;
3663 capsnap.ctime = in->ctime;
3664 capsnap.time_warp_seq = in->time_warp_seq;
3665 capsnap.change_attr = in->change_attr;
3666 capsnap.dirty |= in->caps_dirty();
3667
3668 /* Only reset it if it wasn't set before */
3669 if (capsnap.cap_dirtier_uid == -1) {
3670 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3671 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3672 }
3673
3674 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3675 capsnap.inline_data = in->inline_data;
3676 capsnap.inline_version = in->inline_version;
3677 }
3678
3679 if (used & CEPH_CAP_FILE_BUFFER) {
3680 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
3681 << " WRBUFFER, delaying" << dendl;
3682 } else {
3683 capsnap.dirty_data = 0;
3684 flush_snaps(in);
3685 }
3686 }
3687
3688 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3689 {
3690 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
3691 in->cap_snaps.at(seq).dirty_data = 0;
3692 flush_snaps(in);
3693 }
3694
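// send_flush_snap: marshal a single capsnap into a CEPH_CAP_OP_FLUSHSNAP
// message on the given session. 'follows' is the snapid this state follows;
// the MDS echoes capsnap.flush_tid back in its ack so we can match it up in
// handle_cap_flushsnap_ack.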
3695 void Client::send_flush_snap(Inode *in, MetaSession *session,
3696 snapid_t follows, CapSnap& capsnap)
3697 {
3698 auto m = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP,
3699 in->ino, in->snaprealm->ino, 0,
3700 in->auth_cap->mseq, cap_epoch_barrier);
3701 m->caller_uid = capsnap.cap_dirtier_uid;
3702 m->caller_gid = capsnap.cap_dirtier_gid;
3703
3704 m->set_client_tid(capsnap.flush_tid);
3705 m->head.snap_follows = follows;
3706
3707 m->head.caps = capsnap.issued;
3708 m->head.dirty = capsnap.dirty;
3709
3710 m->head.uid = capsnap.uid;
3711 m->head.gid = capsnap.gid;
3712 m->head.mode = capsnap.mode;
3713 m->btime = capsnap.btime;
3714
3715 m->size = capsnap.size;
3716
3717 m->head.xattr_version = capsnap.xattr_version;
3718 encode(capsnap.xattrs, m->xattrbl);
3719
3720 m->ctime = capsnap.ctime;
3721 m->btime = capsnap.btime;
3722 m->mtime = capsnap.mtime;
3723 m->atime = capsnap.atime;
3724 m->time_warp_seq = capsnap.time_warp_seq;
3725 m->change_attr = capsnap.change_attr;
3726
3727 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3728 m->inline_version = in->inline_version;
3729 m->inline_data = in->inline_data;
3730 }
3731
3732 ceph_assert(!session->flushing_caps_tids.empty());
3733 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3734
3735 session->con->send_message2(std::move(m));
3736 }
3737
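// flush_snaps: send every capsnap that has not been flushed yet (flush_tid ==
// 0) to the auth MDS, oldest first. We stop at the first capsnap that still
// has dirty data or an active writer, since snaps must reach the MDS in
// order.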
3738 void Client::flush_snaps(Inode *in)
3739 {
3740 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
3741 ceph_assert(in->cap_snaps.size());
3742
3743 // pick auth mds
3744 ceph_assert(in->auth_cap);
3745 MetaSession *session = in->auth_cap->session;
3746
3747 for (auto &p : in->cap_snaps) {
3748 CapSnap &capsnap = p.second;
3749 // only start flushes that haven't been sent yet
3750 if (capsnap.flush_tid > 0)
3751 continue;
3752
3753 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3754 << " follows " << p.first
3755 << " size " << capsnap.size
3756 << " mtime " << capsnap.mtime
3757 << " dirty_data=" << capsnap.dirty_data
3758 << " writing=" << capsnap.writing
3759 << " on " << *in << dendl;
3760 if (capsnap.dirty_data || capsnap.writing)
3761 break;
3762
3763 capsnap.flush_tid = ++last_flush_tid;
3764 session->flushing_caps_tids.insert(capsnap.flush_tid);
3765 in->flushing_cap_tids[capsnap.flush_tid] = 0;
3766 if (!in->flushing_cap_item.is_on_list())
3767 session->flushing_caps.push_back(&in->flushing_cap_item);
3768
3769 send_flush_snap(in, session, p.first, capsnap);
3770 }
3771 }
3772
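// Small wait/wake helpers. All of these assume client_lock is held; the
// Cond::Wait calls release the lock while blocked and retake it before
// returning.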
3773 void Client::wait_on_list(list<Cond*>& ls)
3774 {
3775 Cond cond;
3776 ls.push_back(&cond);
3777 cond.Wait(client_lock);
3778 ls.remove(&cond);
3779 }
3780
3781 void Client::signal_cond_list(list<Cond*>& ls)
3782 {
3783 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3784 (*it)->Signal();
3785 }
3786
3787 void Client::wait_on_context_list(list<Context*>& ls)
3788 {
3789 Cond cond;
3790 bool done = false;
3791 int r;
3792 ls.push_back(new C_Cond(&cond, &done, &r));
3793 while (!done)
3794 cond.Wait(client_lock);
3795 }
3796
3797 void Client::signal_context_list(list<Context*>& ls)
3798 {
3799 while (!ls.empty()) {
3800 ls.front()->complete(0);
3801 ls.pop_front();
3802 }
3803 }
3804
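// wake_up_session_caps: wake cap waiters after a session event. On reconnect
// we clear the max_size bookkeeping so it gets renegotiated with the MDS; on
// session renewal, caps whose gen is behind the session's were not re-issued
// by the MDS, so they are downgraded to CEPH_CAP_PIN and the inode is flagged
// I_CAP_DROPPED so the MDS gets told what we still want.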
3805 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
3806 {
3807 for (const auto &cap : s->caps) {
3808 auto &in = cap->inode;
3809 if (reconnect) {
3810 in.requested_max_size = 0;
3811 in.wanted_max_size = 0;
3812 } else {
3813 if (cap->gen < s->cap_gen) {
3814 // mds did not re-issue stale cap.
3815 cap->issued = cap->implemented = CEPH_CAP_PIN;
3816 // make sure mds knows what we want.
3817 if (in.caps_file_wanted() & ~cap->wanted)
3818 in.flags |= I_CAP_DROPPED;
3819 }
3820 }
3821 signal_cond_list(in.waitfor_caps);
3822 }
3823 }
3824
3825
3826 // flush dirty data (from objectcache)
3827
3828 class C_Client_CacheInvalidate : public Context {
3829 private:
3830 Client *client;
3831 vinodeno_t ino;
3832 int64_t offset, length;
3833 public:
3834 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3835 client(c), offset(off), length(len) {
3836 if (client->use_faked_inos())
3837 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3838 else
3839 ino = in->vino();
3840 }
3841 void finish(int r) override {
3842 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3843 ceph_assert(!client->client_lock.is_locked_by_me());
3844 client->_async_invalidate(ino, offset, length);
3845 }
3846 };
3847
3848 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3849 {
3850 if (unmounting)
3851 return;
3852 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
3853 ino_invalidate_cb(callback_handle, ino, off, len);
3854 }
3855
3856 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3857
3858 if (ino_invalidate_cb)
3859 // we queue the invalidate, which calls the callback and decrements the ref
3860 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3861 }
3862
3863 void Client::_invalidate_inode_cache(Inode *in)
3864 {
3865 ldout(cct, 10) << __func__ << " " << *in << dendl;
3866
3867 // invalidate our userspace inode cache
3868 if (cct->_conf->client_oc) {
3869 objectcacher->release_set(&in->oset);
3870 if (!objectcacher->set_is_empty(&in->oset))
3871 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3872 }
3873
3874 _schedule_invalidate_callback(in, 0, 0);
3875 }
3876
3877 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3878 {
3879 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
3880
3881 // invalidate our userspace inode cache
3882 if (cct->_conf->client_oc) {
3883 vector<ObjectExtent> ls;
3884 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3885 objectcacher->discard_writeback(&in->oset, ls, nullptr);
3886 }
3887
3888 _schedule_invalidate_callback(in, off, len);
3889 }
3890
3891 bool Client::_release(Inode *in)
3892 {
3893 ldout(cct, 20) << "_release " << *in << dendl;
3894 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3895 _invalidate_inode_cache(in);
3896 return true;
3897 }
3898 return false;
3899 }
3900
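// _flush: kick off writeback of this inode's dirty buffers. Returns true if
// there is nothing left to flush and onfinish has already been completed
// (including the pool-full case, where dirty data is purged and onfinish gets
// -ENOSPC); otherwise the return value comes from ObjectCacher::flush_set,
// which arranges for onfinish to run once writeback is safe.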
3901 bool Client::_flush(Inode *in, Context *onfinish)
3902 {
3903 ldout(cct, 10) << "_flush " << *in << dendl;
3904
3905 if (!in->oset.dirty_or_tx) {
3906 ldout(cct, 10) << " nothing to flush" << dendl;
3907 onfinish->complete(0);
3908 return true;
3909 }
3910
3911 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3912 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3913 objectcacher->purge_set(&in->oset);
3914 if (onfinish) {
3915 onfinish->complete(-ENOSPC);
3916 }
3917 return true;
3918 }
3919
3920 return objectcacher->flush_set(&in->oset, onfinish);
3921 }
3922
3923 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
3924 {
3925 ceph_assert(client_lock.is_locked());
3926 if (!in->oset.dirty_or_tx) {
3927 ldout(cct, 10) << " nothing to flush" << dendl;
3928 return;
3929 }
3930
3931 C_SaferCond onflush("Client::_flush_range flock");
3932 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
3933 offset, size, &onflush);
3934 if (!ret) {
3935 // wait for flush
3936 client_lock.Unlock();
3937 onflush.wait();
3938 client_lock.Lock();
3939 }
3940 }
3941
3942 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3943 {
3944 // std::lock_guard l(client_lock);
3945 ceph_assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3946 Inode *in = static_cast<Inode *>(oset->parent);
3947 ceph_assert(in);
3948 _flushed(in);
3949 }
3950
3951 void Client::_flushed(Inode *in)
3952 {
3953 ldout(cct, 10) << "_flushed " << *in << dendl;
3954
3955 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
3956 }
3957
3958
3959
3960 // checks common to add_update_cap, handle_cap_grant
3961 void Client::check_cap_issue(Inode *in, unsigned issued)
3962 {
3963 unsigned had = in->caps_issued();
3964
3965 if ((issued & CEPH_CAP_FILE_CACHE) &&
3966 !(had & CEPH_CAP_FILE_CACHE))
3967 in->cache_gen++;
3968
3969 if ((issued & CEPH_CAP_FILE_SHARED) &&
3970 !(had & CEPH_CAP_FILE_SHARED)) {
3971 in->shared_gen++;
3972
3973 if (in->is_dir())
3974 clear_dir_complete_and_ordered(in, true);
3975 }
3976 }
3977
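// add_update_cap: install or refresh the cap this MDS has granted us on the
// inode. Handles opening/switching the snaprealm for the first (or migrated)
// cap, auth-cap handover between MDS ranks, and messages that race with a cap
// export/import cycle (the ceph_seq_cmp checks below keep stale seqs from
// shrinking the cap).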
3978 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
3979 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
3980 inodeno_t realm, int flags, const UserPerm& cap_perms)
3981 {
3982 if (!in->is_any_caps()) {
3983 ceph_assert(in->snaprealm == 0);
3984 in->snaprealm = get_snap_realm(realm);
3985 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
3986 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
3987 } else {
3988 ceph_assert(in->snaprealm);
3989 if ((flags & CEPH_CAP_FLAG_AUTH) &&
3990 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
3991 in->snaprealm_item.remove_myself();
3992 auto oldrealm = in->snaprealm;
3993 in->snaprealm = get_snap_realm(realm);
3994 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
3995 put_snap_realm(oldrealm);
3996 }
3997 }
3998
3999 mds_rank_t mds = mds_session->mds_num;
4000 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4001 Cap &cap = capem.first->second;
4002 if (!capem.second) {
4003 if (cap.gen < mds_session->cap_gen)
4004 cap.issued = cap.implemented = CEPH_CAP_PIN;
4005
4006 /*
4007 * auth mds of the inode changed. we received the cap export
4008 * message, but still haven't received the cap import message.
4009 * handle_cap_export() updated the new auth MDS' cap.
4010 *
4011 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4012 * a message that was sent before the cap import message. So
4013 * don't remove caps.
4014 */
4015 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4016 if (&cap != in->auth_cap)
4017 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4018
4019 ceph_assert(cap.cap_id == cap_id);
4020 seq = cap.seq;
4021 mseq = cap.mseq;
4022 issued |= cap.issued;
4023 flags |= CEPH_CAP_FLAG_AUTH;
4024 }
4025 }
4026
4027 check_cap_issue(in, issued);
4028
4029 if (flags & CEPH_CAP_FLAG_AUTH) {
4030 if (in->auth_cap != &cap &&
4031 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4032 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
4033 ldout(cct, 10) << __func__ << " changing auth cap: "
4034 << "add myself to new auth MDS' flushing caps list" << dendl;
4035 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4036 }
4037 in->auth_cap = &cap;
4038 }
4039 }
4040
4041 unsigned old_caps = cap.issued;
4042 cap.cap_id = cap_id;
4043 cap.issued = issued;
4044 cap.implemented |= issued;
4045 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4046 cap.wanted = wanted;
4047 else
4048 cap.wanted |= wanted;
4049 cap.seq = seq;
4050 cap.issue_seq = seq;
4051 cap.mseq = mseq;
4052 cap.gen = mds_session->cap_gen;
4053 cap.latest_perms = cap_perms;
4054 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4055 << " from mds." << mds
4056 << " on " << *in
4057 << dendl;
4058
4059 if ((issued & ~old_caps) && in->auth_cap == &cap) {
4060 // non-auth MDS is revoking the newly granted caps?
4061 for (auto &p : in->caps) {
4062 if (&p.second == &cap)
4063 continue;
4064 if (p.second.implemented & ~p.second.issued & issued) {
4065 check_caps(in, CHECK_CAPS_NODELAY);
4066 break;
4067 }
4068 }
4069 }
4070
4071 if (issued & ~old_caps)
4072 signal_cond_list(in->waitfor_caps);
4073 }
4074
4075 void Client::remove_cap(Cap *cap, bool queue_release)
4076 {
4077 auto &in = cap->inode;
4078 MetaSession *session = cap->session;
4079 mds_rank_t mds = cap->session->mds_num;
4080
4081 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4082
4083 if (queue_release) {
4084 session->enqueue_cap_release(
4085 in.ino,
4086 cap->cap_id,
4087 cap->issue_seq,
4088 cap->mseq,
4089 cap_epoch_barrier);
4090 }
4091
4092 if (in.auth_cap == cap) {
4093 if (in.flushing_cap_item.is_on_list()) {
4094 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4095 in.flushing_cap_item.remove_myself();
4096 }
4097 in.auth_cap = NULL;
4098 }
4099 size_t n = in.caps.erase(mds);
4100 ceph_assert(n == 1);
4101 cap = nullptr;
4102
4103 if (!in.is_any_caps()) {
4104 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4105 in.snaprealm_item.remove_myself();
4106 put_snap_realm(in.snaprealm);
4107 in.snaprealm = 0;
4108 }
4109 }
4110
4111 void Client::remove_all_caps(Inode *in)
4112 {
4113 while (!in->caps.empty())
4114 remove_cap(&in->caps.begin()->second, true);
4115 }
4116
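// remove_session_caps: drop every cap held on this session, e.g. when the
// session is closed or the client is evicted. Any dirty or flushing cap state
// is discarded (with an error logged), since there is no longer a session to
// flush it through.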
4117 void Client::remove_session_caps(MetaSession *s)
4118 {
4119 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4120
4121 while (s->caps.size()) {
4122 Cap *cap = *s->caps.begin();
4123 InodeRef in(&cap->inode);
4124 bool dirty_caps = false;
4125 if (in->auth_cap == cap) {
4126 dirty_caps = in->dirty_caps | in->flushing_caps;
4127 in->wanted_max_size = 0;
4128 in->requested_max_size = 0;
4129 }
4130 if (cap->wanted | cap->issued)
4131 in->flags |= I_CAP_DROPPED;
4132 remove_cap(cap, false);
4133 in->cap_snaps.clear();
4134 if (dirty_caps) {
4135 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4136 if (in->flushing_caps) {
4137 num_flushing_caps--;
4138 in->flushing_cap_tids.clear();
4139 }
4140 in->flushing_caps = 0;
4141 in->mark_caps_clean();
4142 put_inode(in.get());
4143 }
4144 signal_cond_list(in->waitfor_caps);
4145 }
4146 s->flushing_caps_tids.clear();
4147 sync_cond.Signal();
4148 }
4149
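// _do_remount: ask the mount (via remount_cb) to remount itself, which makes
// the kernel drop unused dentries when we cannot invalidate them one by one.
// Depending on configuration, a failed remount is fatal, since it can leave
// stale dentries referencing trimmed inodes.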
4150 int Client::_do_remount(bool retry_on_error)
4151 {
4152 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
4153
4154 errno = 0;
4155 int r = remount_cb(callback_handle);
4156 if (r == 0) {
4157 retries_on_invalidate = 0;
4158 } else {
4159 int e = errno;
4160 client_t whoami = get_nodeid();
4161 if (r == -1) {
4162 lderr(cct) <<
4163 "failed to remount (to trim kernel dentries): "
4164 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4165 } else {
4166 lderr(cct) <<
4167 "failed to remount (to trim kernel dentries): "
4168 "return code = " << r << dendl;
4169 }
4170 bool should_abort =
4171 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4172 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4173 !(retry_on_error && (++retries_on_invalidate < max_retries));
4174 if (should_abort && !unmounting) {
4175 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4176 ceph_abort();
4177 }
4178 }
4179 return r;
4180 }
4181
4182 class C_Client_Remount : public Context {
4183 private:
4184 Client *client;
4185 public:
4186 explicit C_Client_Remount(Client *c) : client(c) {}
4187 void finish(int r) override {
4188 ceph_assert(r == 0);
4189 client->_do_remount(true);
4190 }
4191 };
4192
4193 void Client::_invalidate_kernel_dcache()
4194 {
4195 if (unmounting)
4196 return;
4197 if (can_invalidate_dentries) {
4198 if (dentry_invalidate_cb && root->dir) {
4199 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4200 p != root->dir->dentries.end();
4201 ++p) {
4202 if (p->second->inode)
4203 _schedule_invalidate_dentry_callback(p->second, false);
4204 }
4205 }
4206 } else if (remount_cb) {
4207 // Hacky:
4208 // when remounting a file system, the Linux kernel trims all unused dentries in the fs
4209 remount_finisher.queue(new C_Client_Remount(this));
4210 }
4211 }
4212
4213 void Client::_trim_negative_child_dentries(InodeRef& in)
4214 {
4215 if (!in->is_dir())
4216 return;
4217
4218 Dir* dir = in->dir;
4219 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4220 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4221 Dentry *dn = p->second;
4222 ++p;
4223 ceph_assert(!dn->inode);
4224 if (dn->lru_is_expireable())
4225 unlink(dn, true, false); // keep dir, drop dentry
4226 }
4227 if (dir->dentries.empty()) {
4228 close_dir(dir);
4229 }
4230 }
4231
4232 if (in->flags & I_SNAPDIR_OPEN) {
4233 InodeRef snapdir = open_snapdir(in.get());
4234 _trim_negative_child_dentries(snapdir);
4235 }
4236 }
4237
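// trim_caps: reduce the number of caps held on this session to 'max',
// typically in response to an MDS session recall. Unused non-auth caps are
// dropped directly; for everything else we try to trim the dentries pinning
// the inode so the cap can be released normally. If we still exceed the
// limit afterwards, fall back to invalidating the kernel dcache.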
4238 void Client::trim_caps(MetaSession *s, uint64_t max)
4239 {
4240 mds_rank_t mds = s->mds_num;
4241 size_t caps_size = s->caps.size();
4242 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
4243 << " caps " << caps_size << dendl;
4244
4245 uint64_t trimmed = 0;
4246 auto p = s->caps.begin();
4247 std::set<Dentry *> to_trim; /* this prevents caps other than the one we're
4248 * looking at from being deleted during traversal. */
4249 while ((caps_size - trimmed) > max && !p.end()) {
4250 Cap *cap = *p;
4251 InodeRef in(&cap->inode);
4252
4253 // Increment p early because it will be invalidated if cap
4254 // is deleted inside remove_cap
4255 ++p;
4256
4257 if (in->caps.size() > 1 && cap != in->auth_cap) {
4258 int mine = cap->issued | cap->implemented;
4259 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4260 // disposable non-auth cap
4261 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
4262 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4263 cap = (remove_cap(cap, true), nullptr);
4264 trimmed++;
4265 }
4266 } else {
4267 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4268 _trim_negative_child_dentries(in);
4269 bool all = true;
4270 auto q = in->dentries.begin();
4271 while (q != in->dentries.end()) {
4272 Dentry *dn = *q;
4273 ++q;
4274 if (dn->lru_is_expireable()) {
4275 if (can_invalidate_dentries &&
4276 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4277 // Only issue one of these per DN for inodes in root: handle
4278 // others more efficiently by calling for root-child DNs at
4279 // the end of this function.
4280 _schedule_invalidate_dentry_callback(dn, true);
4281 }
4282 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4283 to_trim.insert(dn);
4284 } else {
4285 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4286 all = false;
4287 }
4288 }
4289 if (all && in->ino != MDS_INO_ROOT) {
4290 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4291 trimmed++;
4292 }
4293 }
4294 }
4295 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4296 for (const auto &dn : to_trim) {
4297 trim_dentry(dn);
4298 }
4299 to_trim.clear();
4300
4301 caps_size = s->caps.size();
4302 if (caps_size > (size_t)max)
4303 _invalidate_kernel_dcache();
4304 }
4305
4306 void Client::force_session_readonly(MetaSession *s)
4307 {
4308 s->readonly = true;
4309 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4310 auto &in = (*p)->inode;
4311 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4312 signal_cond_list(in.waitfor_caps);
4313 }
4314 }
4315
4316 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4317 {
4318 MetaSession *session = in->auth_cap->session;
4319
4320 int flushing = in->dirty_caps;
4321 ceph_assert(flushing);
4322
4323 ceph_tid_t flush_tid = ++last_flush_tid;
4324 in->flushing_cap_tids[flush_tid] = flushing;
4325
4326 if (!in->flushing_caps) {
4327 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4328 num_flushing_caps++;
4329 } else {
4330 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4331 }
4332
4333 in->flushing_caps |= flushing;
4334 in->mark_caps_clean();
4335
4336 if (!in->flushing_cap_item.is_on_list())
4337 session->flushing_caps.push_back(&in->flushing_cap_item);
4338 session->flushing_caps_tids.insert(flush_tid);
4339
4340 *ptid = flush_tid;
4341 return flushing;
4342 }
4343
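// adjust_session_flushing_caps: the auth cap for this inode moved to another
// MDS; migrate all outstanding flush tids (both cap flushes and capsnap
// flushes) to the new session so their acks can still be matched.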
4344 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4345 {
4346 for (auto &p : in->cap_snaps) {
4347 CapSnap &capsnap = p.second;
4348 if (capsnap.flush_tid > 0) {
4349 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4350 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4351 }
4352 }
4353 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4354 it != in->flushing_cap_tids.end();
4355 ++it) {
4356 old_s->flushing_caps_tids.erase(it->first);
4357 new_s->flushing_caps_tids.insert(it->first);
4358 }
4359 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4360 }
4361
4362 /*
4363 * Flush all caps back to the MDS. Because the callers generally wait on the
4364 * result of this function (syncfs and umount cases), we set
4365 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4366 */
4367 void Client::flush_caps_sync()
4368 {
4369 ldout(cct, 10) << __func__ << dendl;
4370 xlist<Inode*>::iterator p = delayed_list.begin();
4371 while (!p.end()) {
4372 unsigned flags = CHECK_CAPS_NODELAY;
4373 Inode *in = *p;
4374
4375 ++p;
4376 delayed_list.pop_front();
4377 if (p.end() && dirty_list.empty())
4378 flags |= CHECK_CAPS_SYNCHRONOUS;
4379 check_caps(in, flags);
4380 }
4381
4382 // other caps, too
4383 p = dirty_list.begin();
4384 while (!p.end()) {
4385 unsigned flags = CHECK_CAPS_NODELAY;
4386 Inode *in = *p;
4387
4388 ++p;
4389 if (p.end())
4390 flags |= CHECK_CAPS_SYNCHRONOUS;
4391 check_caps(in, flags);
4392 }
4393 }
4394
4395 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4396 {
4397 while (in->flushing_caps) {
4398 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4399 ceph_assert(it != in->flushing_cap_tids.end());
4400 if (it->first > want)
4401 break;
4402 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4403 << ccap_string(it->second) << " want " << want
4404 << " last " << it->first << dendl;
4405 wait_on_list(in->waitfor_caps);
4406 }
4407 }
4408
4409 void Client::wait_sync_caps(ceph_tid_t want)
4410 {
4411 retry:
4412 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4413 << num_flushing_caps << " total flushing)" << dendl;
4414 for (auto &p : mds_sessions) {
4415 MetaSession *s = &p.second;
4416 if (s->flushing_caps_tids.empty())
4417 continue;
4418 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4419 if (oldest_tid <= want) {
4420 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4421 << " (want " << want << ")" << dendl;
4422 sync_cond.Wait(client_lock);
4423 goto retry;
4424 }
4425 }
4426 }
4427
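// kick_flushing_caps: re-send this inode's pending cap flushes and capsnap
// flushes in tid order, e.g. after an MDS restart or cap migration.
// FLAG_PENDING_CAPSNAP marks cap flushes that are followed by a not-yet-acked
// capsnap flush, so the MDS processes them in the right order.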
4428 void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4429 {
4430 in->flags &= ~I_KICK_FLUSH;
4431
4432 Cap *cap = in->auth_cap;
4433 ceph_assert(cap->session == session);
4434
4435 ceph_tid_t last_snap_flush = 0;
4436 for (auto p = in->flushing_cap_tids.rbegin();
4437 p != in->flushing_cap_tids.rend();
4438 ++p) {
4439 if (!p->second) {
4440 last_snap_flush = p->first;
4441 break;
4442 }
4443 }
4444
4445 int wanted = in->caps_wanted();
4446 int used = get_caps_used(in) | in->caps_dirty();
4447 auto it = in->cap_snaps.begin();
4448 for (auto& p : in->flushing_cap_tids) {
4449 if (p.second) {
4450 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4451 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4452 p.second, p.first);
4453 } else {
4454 ceph_assert(it != in->cap_snaps.end());
4455 ceph_assert(it->second.flush_tid == p.first);
4456 send_flush_snap(in, session, it->first, it->second);
4457 ++it;
4458 }
4459 }
4460 }
4461
4462 void Client::kick_flushing_caps(MetaSession *session)
4463 {
4464 mds_rank_t mds = session->mds_num;
4465 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4466
4467 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4468 Inode *in = *p;
4469 if (in->flags & I_KICK_FLUSH) {
4470 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4471 kick_flushing_caps(in, session);
4472 }
4473 }
4474 }
4475
4476 void Client::early_kick_flushing_caps(MetaSession *session)
4477 {
4478 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4479 Inode *in = *p;
4480 Cap *cap = in->auth_cap;
4481 ceph_assert(cap);
4482
4483 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4484 // stage. This guarantees that MDS processes the cap flush message before issuing
4485 // the flushing caps to other clients.
4486 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4487 in->flags |= I_KICK_FLUSH;
4488 continue;
4489 }
4490
4491 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4492 << " to mds." << session->mds_num << dendl;
4493 // send_reconnect() also will reset these sequence numbers. make sure
4494 // sequence numbers in cap flush message match later reconnect message.
4495 cap->seq = 0;
4496 cap->issue_seq = 0;
4497 cap->mseq = 0;
4498 cap->issued = cap->implemented;
4499
4500 kick_flushing_caps(in, session);
4501 }
4502 }
4503
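// build_snap_context: recompute this realm's cached SnapContext. The snap set
// is the union of snaps inherited from prior parents, the current parent's
// snaps that postdate parent_since, and the realm's own snaps; seq is the
// maximum across all of them, and the snap list is kept in descending order,
// as SnapContext requires.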
4504 void SnapRealm::build_snap_context()
4505 {
4506 set<snapid_t> snaps;
4507 snapid_t max_seq = seq;
4508
4509 // start with prior_parents?
4510 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4511 snaps.insert(prior_parent_snaps[i]);
4512
4513 // current parent's snaps
4514 if (pparent) {
4515 const SnapContext& psnapc = pparent->get_snap_context();
4516 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4517 if (psnapc.snaps[i] >= parent_since)
4518 snaps.insert(psnapc.snaps[i]);
4519 if (psnapc.seq > max_seq)
4520 max_seq = psnapc.seq;
4521 }
4522
4523 // my snaps
4524 for (unsigned i=0; i<my_snaps.size(); i++)
4525 snaps.insert(my_snaps[i]);
4526
4527 // ok!
4528 cached_snap_context.seq = max_seq;
4529 cached_snap_context.snaps.resize(0);
4530 cached_snap_context.snaps.reserve(snaps.size());
4531 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4532 cached_snap_context.snaps.push_back(*p);
4533 }
4534
4535 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4536 {
4537 list<SnapRealm*> q;
4538 q.push_back(realm);
4539
4540 while (!q.empty()) {
4541 realm = q.front();
4542 q.pop_front();
4543
4544 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4545 realm->invalidate_cache();
4546
4547 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4548 p != realm->pchildren.end();
4549 ++p)
4550 q.push_back(*p);
4551 }
4552 }
4553
4554 SnapRealm *Client::get_snap_realm(inodeno_t r)
4555 {
4556 SnapRealm *realm = snap_realms[r];
4557 if (!realm)
4558 snap_realms[r] = realm = new SnapRealm(r);
4559 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4560 realm->nref++;
4561 return realm;
4562 }
4563
4564 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4565 {
4566 if (snap_realms.count(r) == 0) {
4567 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4568 return NULL;
4569 }
4570 SnapRealm *realm = snap_realms[r];
4571 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4572 realm->nref++;
4573 return realm;
4574 }
4575
4576 void Client::put_snap_realm(SnapRealm *realm)
4577 {
4578 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4579 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4580 if (--realm->nref == 0) {
4581 snap_realms.erase(realm->ino);
4582 if (realm->pparent) {
4583 realm->pparent->pchildren.erase(realm);
4584 put_snap_realm(realm->pparent);
4585 }
4586 delete realm;
4587 }
4588 }
4589
4590 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4591 {
4592 if (realm->parent != parent) {
4593 ldout(cct, 10) << __func__ << " " << *realm
4594 << " " << realm->parent << " -> " << parent << dendl;
4595 realm->parent = parent;
4596 if (realm->pparent) {
4597 realm->pparent->pchildren.erase(realm);
4598 put_snap_realm(realm->pparent);
4599 }
4600 realm->pparent = get_snap_realm(parent);
4601 realm->pparent->pchildren.insert(realm);
4602 return true;
4603 }
4604 return false;
4605 }
4606
4607 static bool has_new_snaps(const SnapContext& old_snapc,
4608 const SnapContext& new_snapc)
4609 {
4610 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4611 }
4612
4613
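// update_snap_trace: decode a snap trace (a chain of SnapRealmInfo records
// sent by the MDS) and apply it to our realm tree. When 'flush' is set, any
// realm whose seq advanced first gets its inodes' dirty state snapshotted
// against the old snap context (queue_cap_snap) before the new snaps become
// visible.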
4614 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4615 {
4616 SnapRealm *first_realm = NULL;
4617 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4618
4619 map<SnapRealm*, SnapContext> dirty_realms;
4620
4621 auto p = bl.cbegin();
4622 while (!p.end()) {
4623 SnapRealmInfo info;
4624 decode(info, p);
4625 SnapRealm *realm = get_snap_realm(info.ino());
4626
4627 bool invalidate = false;
4628
4629 if (info.seq() > realm->seq) {
4630 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4631 << dendl;
4632
4633 if (flush) {
4634 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4635 // flush me + children
4636 list<SnapRealm*> q;
4637 q.push_back(realm);
4638 while (!q.empty()) {
4639 SnapRealm *realm = q.front();
4640 q.pop_front();
4641
4642 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4643 p != realm->pchildren.end();
4644 ++p)
4645 q.push_back(*p);
4646
4647 if (dirty_realms.count(realm) == 0) {
4648 realm->nref++;
4649 dirty_realms[realm] = realm->get_snap_context();
4650 }
4651 }
4652 }
4653
4654 // update
4655 realm->seq = info.seq();
4656 realm->created = info.created();
4657 realm->parent_since = info.parent_since();
4658 realm->prior_parent_snaps = info.prior_parent_snaps;
4659 realm->my_snaps = info.my_snaps;
4660 invalidate = true;
4661 }
4662
4663 // _always_ verify parent
4664 if (adjust_realm_parent(realm, info.parent()))
4665 invalidate = true;
4666
4667 if (invalidate) {
4668 invalidate_snaprealm_and_children(realm);
4669 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4670 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4671 } else {
4672 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4673 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4674 }
4675
4676 if (!first_realm)
4677 first_realm = realm;
4678 else
4679 put_snap_realm(realm);
4680 }
4681
4682 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4683 q != dirty_realms.end();
4684 ++q) {
4685 SnapRealm *realm = q->first;
4686 // are there new snaps?
4687 if (has_new_snaps(q->second, realm->get_snap_context())) {
4688 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4689 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4690 while (!r.end()) {
4691 Inode *in = *r;
4692 ++r;
4693 queue_cap_snap(in, q->second);
4694 }
4695 } else {
4696 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4697 }
4698 put_snap_realm(realm);
4699 }
4700
4701 if (realm_ret)
4702 *realm_ret = first_realm;
4703 else
4704 put_snap_realm(first_realm);
4705 }
4706
4707 void Client::handle_snap(const MConstRef<MClientSnap>& m)
4708 {
4709 ldout(cct, 10) << __func__ << " " << *m << dendl;
4710 mds_rank_t mds = mds_rank_t(m->get_source().num());
4711 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4712 if (!session) {
4713 return;
4714 }
4715
4716 got_mds_push(session);
4717
4718 map<Inode*, SnapContext> to_move;
4719 SnapRealm *realm = 0;
4720
4721 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4722 ceph_assert(m->head.split);
4723 SnapRealmInfo info;
4724 auto p = m->bl.cbegin();
4725 decode(info, p);
4726 ceph_assert(info.ino() == m->head.split);
4727
4728 // flush, then move, inos.
4729 realm = get_snap_realm(info.ino());
4730 ldout(cct, 10) << " splitting off " << *realm << dendl;
4731 for (auto& ino : m->split_inos) {
4732 vinodeno_t vino(ino, CEPH_NOSNAP);
4733 if (inode_map.count(vino)) {
4734 Inode *in = inode_map[vino];
4735 if (!in->snaprealm || in->snaprealm == realm)
4736 continue;
4737 if (in->snaprealm->created > info.created()) {
4738 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4739 << *in->snaprealm << dendl;
4740 continue;
4741 }
4742 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4743
4744
4745 in->snaprealm_item.remove_myself();
4746 to_move[in] = in->snaprealm->get_snap_context();
4747 put_snap_realm(in->snaprealm);
4748 }
4749 }
4750
4751 // move child snaprealms, too
4752 for (auto& child_realm : m->split_realms) {
4753 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
4754 SnapRealm *child = get_snap_realm_maybe(child_realm);
4755 if (!child)
4756 continue;
4757 adjust_realm_parent(child, realm->ino);
4758 put_snap_realm(child);
4759 }
4760 }
4761
4762 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
4763
4764 if (realm) {
4765 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
4766 Inode *in = p->first;
4767 in->snaprealm = realm;
4768 realm->inodes_with_caps.push_back(&in->snaprealm_item);
4769 realm->nref++;
4770 // queue for snap writeback
4771 if (has_new_snaps(p->second, realm->get_snap_context()))
4772 queue_cap_snap(in, p->second);
4773 }
4774 put_snap_realm(realm);
4775 }
4776 }
4777
4778 void Client::handle_quota(const MConstRef<MClientQuota>& m)
4779 {
4780 mds_rank_t mds = mds_rank_t(m->get_source().num());
4781 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4782 if (!session) {
4783 return;
4784 }
4785
4786 got_mds_push(session);
4787
4788 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
4789
4790 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4791 if (inode_map.count(vino)) {
4792 Inode *in = NULL;
4793 in = inode_map[vino];
4794
4795 if (in) {
4796 in->quota = m->quota;
4797 in->rstat = m->rstat;
4798 }
4799 }
4800 }
4801
4802 void Client::handle_caps(const MConstRef<MClientCaps>& m)
4803 {
4804 mds_rank_t mds = mds_rank_t(m->get_source().num());
4805 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4806 if (!session) {
4807 return;
4808 }
4809
4810 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4811 // Pause RADOS operations until we see the required epoch
4812 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4813 }
4814
4815 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4816 // Record the barrier so that we will transmit it to MDS when releasing
4817 set_cap_epoch_barrier(m->osd_epoch_barrier);
4818 }
4819
4820 got_mds_push(session);
4821
4822 Inode *in;
4823 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4824 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4825 in = it->second;
4826 } else {
4827 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4828 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4829 session->enqueue_cap_release(
4830 m->get_ino(),
4831 m->get_cap_id(),
4832 m->get_seq(),
4833 m->get_mseq(),
4834 cap_epoch_barrier);
4835 } else {
4836 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
4837 }
4838
4839 // in case the mds is waiting on e.g. a revocation
4840 flush_cap_releases();
4841 return;
4842 }
4843
4844 switch (m->get_op()) {
4845 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4846 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4847 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
4848 }
4849
4850 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4851 Cap &cap = in->caps.at(mds);
4852
4853 switch (m->get_op()) {
4854 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4855 case CEPH_CAP_OP_IMPORT:
4856 case CEPH_CAP_OP_REVOKE:
4857 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4858 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4859 }
4860 } else {
4861 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4862 return;
4863 }
4864 }
4865
4866 void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4867 {
4868 mds_rank_t mds = session->mds_num;
4869
4870 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
4871 << " IMPORT from mds." << mds << dendl;
4872
4873 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4874 Cap *cap = NULL;
4875 UserPerm cap_perms;
4876 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
4877 cap = &it->second;
4878 cap_perms = cap->latest_perms;
4879 }
4880
4881 // add/update it
4882 SnapRealm *realm = NULL;
4883 update_snap_trace(m->snapbl, &realm);
4884
4885 add_update_cap(in, session, m->get_cap_id(),
4886 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
4887 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
4888
4889 if (cap && cap->cap_id == m->peer.cap_id) {
4890 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
4891 }
4892
4893 if (realm)
4894 put_snap_realm(realm);
4895
4896 if (in->auth_cap && in->auth_cap->session == session) {
4897 // reflush any/all caps (if we are now the auth_cap)
4898 kick_flushing_caps(in, session);
4899 }
4900 }
4901
4902 void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4903 {
4904 mds_rank_t mds = session->mds_num;
4905
4906 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
4907 << " EXPORT from mds." << mds << dendl;
4908
4909 auto it = in->caps.find(mds);
4910 if (it != in->caps.end()) {
4911 Cap &cap = it->second;
4912 if (cap.cap_id == m->get_cap_id()) {
4913 if (m->peer.cap_id) {
4914 const auto peer_mds = mds_rank_t(m->peer.mds);
4915 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
4916 auto it = in->caps.find(peer_mds);
4917 if (it != in->caps.end()) {
4918 Cap &tcap = it->second;
4919 if (tcap.cap_id == m->peer.cap_id &&
4920 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
4921 tcap.cap_id = m->peer.cap_id;
4922 tcap.seq = m->peer.seq - 1;
4923 tcap.issue_seq = tcap.seq;
4924 tcap.issued |= cap.issued;
4925 tcap.implemented |= cap.issued;
4926 if (&cap == in->auth_cap)
4927 in->auth_cap = &tcap;
4928 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
4929 adjust_session_flushing_caps(in, session, tsession);
4930 }
4931 } else {
4932 add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
4933 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
4934 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
4935 cap.latest_perms);
4936 }
4937 } else {
4938 if (cap.wanted | cap.issued)
4939 in->flags |= I_CAP_DROPPED;
4940 }
4941
4942 remove_cap(&cap, false);
4943 }
4944 }
4945 }
4946
4947 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4948 {
4949 mds_rank_t mds = session->mds_num;
4950 ceph_assert(in->caps.count(mds));
4951
4952 ldout(cct, 10) << __func__ << " on ino " << *in
4953 << " size " << in->size << " -> " << m->get_size()
4954 << dendl;
4955
4956 int issued;
4957 in->caps_issued(&issued);
4958 issued |= in->caps_dirty();
4959 update_inode_file_size(in, issued, m->get_size(),
4960 m->get_truncate_seq(), m->get_truncate_size());
4961 }
4962
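// handle_cap_flush_ack: the MDS acknowledged a cap flush. Acks arrive in tid
// order, so every plain cap-flush tid <= flush_ack_tid can be retired here;
// entries with a zero value are capsnap flushes, which are acked separately
// via handle_cap_flushsnap_ack and therefore skipped.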
4963 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
4964 {
4965 ceph_tid_t flush_ack_tid = m->get_client_tid();
4966 int dirty = m->get_dirty();
4967 int cleaned = 0;
4968 int flushed = 0;
4969
4970 auto it = in->flushing_cap_tids.begin();
4971 if (it->first < flush_ack_tid) {
4972 ldout(cct, 0) << __func__ << " mds." << session->mds_num
4973 << " got unexpected flush ack tid " << flush_ack_tid
4974 << " expected is " << it->first << dendl;
4975 }
4976 for (; it != in->flushing_cap_tids.end(); ) {
4977 if (!it->second) {
4978 // cap snap
4979 ++it;
4980 continue;
4981 }
4982 if (it->first == flush_ack_tid)
4983 cleaned = it->second;
4984 if (it->first <= flush_ack_tid) {
4985 session->flushing_caps_tids.erase(it->first);
4986 in->flushing_cap_tids.erase(it++);
4987 ++flushed;
4988 continue;
4989 }
4990 cleaned &= ~it->second;
4991 if (!cleaned)
4992 break;
4993 ++it;
4994 }
4995
4996 ldout(cct, 5) << __func__ << " mds." << session->mds_num
4997 << " cleaned " << ccap_string(cleaned) << " on " << *in
4998 << " with " << ccap_string(dirty) << dendl;
4999
5000 if (flushed) {
5001 signal_cond_list(in->waitfor_caps);
5002 if (session->flushing_caps_tids.empty() ||
5003 *session->flushing_caps_tids.begin() > flush_ack_tid)
5004 sync_cond.Signal();
5005 }
5006
5007 if (!dirty) {
5008 in->cap_dirtier_uid = -1;
5009 in->cap_dirtier_gid = -1;
5010 }
5011
5012 if (!cleaned) {
5013 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5014 } else {
5015 if (in->flushing_caps) {
5016 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5017 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5018 in->flushing_caps &= ~cleaned;
5019 if (in->flushing_caps == 0) {
5020 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5021 num_flushing_caps--;
5022 if (in->flushing_cap_tids.empty())
5023 in->flushing_cap_item.remove_myself();
5024 }
5025 if (!in->caps_dirty())
5026 put_inode(in);
5027 }
5028 }
5029 }
5030
5031
5032 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5033 {
5034 ceph_tid_t flush_ack_tid = m->get_client_tid();
5035 mds_rank_t mds = session->mds_num;
5036 ceph_assert(in->caps.count(mds));
5037 snapid_t follows = m->get_snap_follows();
5038
5039 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5040 auto& capsnap = it->second;
5041 if (flush_ack_tid != capsnap.flush_tid) {
5042 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
5043 } else {
5044 InodeRef tmp_ref(in);
5045 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
5046 << " on " << *in << dendl;
5047 session->flushing_caps_tids.erase(capsnap.flush_tid);
5048 in->flushing_cap_tids.erase(capsnap.flush_tid);
5049 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5050 in->flushing_cap_item.remove_myself();
5051 in->cap_snaps.erase(it);
5052
5053 signal_cond_list(in->waitfor_caps);
5054 if (session->flushing_caps_tids.empty() ||
5055 *session->flushing_caps_tids.begin() > flush_ack_tid)
5056 sync_cond.Signal();
5057 }
5058 } else {
5059 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
5060 << " on " << *in << dendl;
5061 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple acks back
5062 }
5063 }
5064
5065 class C_Client_DentryInvalidate : public Context {
5066 private:
5067 Client *client;
5068 vinodeno_t dirino;
5069 vinodeno_t ino;
5070 string name;
5071 public:
5072 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5073 client(c), name(dn->name) {
5074 if (client->use_faked_inos()) {
5075 dirino.ino = dn->dir->parent_inode->faked_ino;
5076 if (del)
5077 ino.ino = dn->inode->faked_ino;
5078 } else {
5079 dirino = dn->dir->parent_inode->vino();
5080 if (del)
5081 ino = dn->inode->vino();
5082 }
5083 if (!del)
5084 ino.ino = inodeno_t();
5085 }
5086 void finish(int r) override {
5087 // _async_dentry_invalidate is responsible for its own locking
5088 ceph_assert(!client->client_lock.is_locked_by_me());
5089 client->_async_dentry_invalidate(dirino, ino, name);
5090 }
5091 };
5092
5093 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5094 {
5095 if (unmounting)
5096 return;
5097 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5098 << " in dir " << dirino << dendl;
5099 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5100 }
5101
5102 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5103 {
5104 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5105 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5106 }
5107
5108 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5109 {
5110 int ref = in->get_num_ref();
5111 ldout(cct, 5) << __func__ << " in " << *in <<dendl;
5112
5113 if (in->dir && !in->dir->dentries.empty()) {
5114 for (auto p = in->dir->dentries.begin();
5115 p != in->dir->dentries.end(); ) {
5116 Dentry *dn = p->second;
5117 ++p;
4118 /* rmsnap removes the whole subtree, so we need to trim inodes recursively.
4119 * We don't need to invalidate dentries recursively, because
4120 * invalidating a directory dentry effectively invalidates the
4121 * whole subtree */
5122 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5123 _try_to_trim_inode(dn->inode.get(), false);
5124
5125 if (dn->lru_is_expireable())
5126 unlink(dn, true, false); // keep dir, drop dentry
5127 }
5128 if (in->dir->dentries.empty()) {
5129 close_dir(in->dir);
5130 --ref;
5131 }
5132 }
5133
5134 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
5135 InodeRef snapdir = open_snapdir(in);
5136 _try_to_trim_inode(snapdir.get(), false);
5137 --ref;
5138 }
5139
5140 if (ref > 0) {
5141 auto q = in->dentries.begin();
5142 while (q != in->dentries.end()) {
5143 Dentry *dn = *q;
5144 ++q;
5145 if (in->ll_ref > 0 && sched_inval) {
5146 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5147 // so in->dentries doesn't always reflect the state of kernel's dcache.
5148 _schedule_invalidate_dentry_callback(dn, true);
5149 }
5150 unlink(dn, true, true);
5151 }
5152 }
5153 }
5154
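// handle_cap_grant: apply a GRANT/REVOKE/IMPORT from the MDS. First refresh
// the inode fields covered by the newly issued caps, then reconcile the cap
// bits: on revocation, buffered data must be flushed (FILE_BUFFER) or cached
// data dropped (FILE_CACHE) before check_caps acks the MDS; a plain grant
// only needs a check if some other MDS still implements bits that were just
// granted here.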
5155 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5156 {
5157 mds_rank_t mds = session->mds_num;
5158 int used = get_caps_used(in);
5159 int wanted = in->caps_wanted();
5160
5161 const unsigned new_caps = m->get_caps();
5162 const bool was_stale = session->cap_gen > cap->gen;
5163 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5164 << " mds." << mds << " seq " << m->get_seq()
5165 << " caps now " << ccap_string(new_caps)
5166 << " was " << ccap_string(cap->issued)
5167 << (was_stale ? " (stale)" : "") << dendl;
5168
5169 if (was_stale)
5170 cap->issued = cap->implemented = CEPH_CAP_PIN;
5171 cap->seq = m->get_seq();
5172 cap->gen = session->cap_gen;
5173
5174 check_cap_issue(in, new_caps);
5175
5176 // update inode
5177 int issued;
5178 in->caps_issued(&issued);
5179 issued |= in->caps_dirty();
5180
5181 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5182 !(issued & CEPH_CAP_AUTH_EXCL)) {
5183 in->mode = m->head.mode;
5184 in->uid = m->head.uid;
5185 in->gid = m->head.gid;
5186 in->btime = m->btime;
5187 }
5188 bool deleted_inode = false;
5189 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5190 !(issued & CEPH_CAP_LINK_EXCL)) {
5191 in->nlink = m->head.nlink;
5192 if (in->nlink == 0 &&
5193 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5194 deleted_inode = true;
5195 }
5196 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5197 m->xattrbl.length() &&
5198 m->head.xattr_version > in->xattr_version) {
5199 auto p = m->xattrbl.cbegin();
5200 decode(in->xattrs, p);
5201 in->xattr_version = m->head.xattr_version;
5202 }
5203
5204 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5205 in->dirstat.nfiles = m->get_nfiles();
5206 in->dirstat.nsubdirs = m->get_nsubdirs();
5207 }
5208
5209 if (new_caps & CEPH_CAP_ANY_RD) {
5210 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5211 m->get_ctime(), m->get_mtime(), m->get_atime());
5212 }
5213
5214 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5215 in->layout = m->get_layout();
5216 update_inode_file_size(in, issued, m->get_size(),
5217 m->get_truncate_seq(), m->get_truncate_size());
5218 }
5219
5220 if (m->inline_version > in->inline_version) {
5221 in->inline_data = m->inline_data;
5222 in->inline_version = m->inline_version;
5223 }
5224
5225 /* always take a newer change attr */
5226 if (m->get_change_attr() > in->change_attr)
5227 in->change_attr = m->get_change_attr();
5228
5229 // max_size
5230 if (cap == in->auth_cap &&
5231 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5232 (m->get_max_size() != in->max_size)) {
5233 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5234 in->max_size = m->get_max_size();
5235 if (in->max_size > in->wanted_max_size) {
5236 in->wanted_max_size = 0;
5237 in->requested_max_size = 0;
5238 }
5239 }
5240
5241 bool check = false;
5242 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5243 (wanted & ~(cap->wanted | new_caps))) {
5244 // If the mds is importing the cap, prior cap messages that update 'wanted'
5245 // may get dropped by the mds (migrate seq mismatch).
5246 //
5247 // We don't send a cap message to update 'wanted' if what we want is
5248 // already issued. If the mds revokes caps, the cap message that releases
5249 // caps also tells the mds what we want. But if caps got forcibly revoked
5250 // by the mds (stale session), we may not have told the mds what we want.
5251 check = true;
5252 }
5253
5254
5255 // update caps
5256 auto revoked = cap->issued & ~new_caps;
5257 if (revoked) {
5258 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5259 cap->issued = new_caps;
5260 cap->implemented |= new_caps;
5261
5262 // recall delegations if we're losing caps necessary for them
5263 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5264 in->recall_deleg(false);
5265 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5266 in->recall_deleg(true);
5267
5268 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5269 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5270 !_flush(in, new C_Client_FlushComplete(this, in))) {
5271 // waitin' for flush
5272 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5273 if (_release(in))
5274 check = true;
5275 } else {
5276 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5277 check = true;
5278 }
5279 } else if (cap->issued == new_caps) {
5280 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5281 } else {
5282 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5283 cap->issued = new_caps;
5284 cap->implemented |= new_caps;
5285
5286 if (cap == in->auth_cap) {
5287 // non-auth MDS is revoking the newly granted caps?
5288 for (const auto &p : in->caps) {
5289 if (&p.second == cap)
5290 continue;
5291 if (p.second.implemented & ~p.second.issued & new_caps) {
5292 check = true;
5293 break;
5294 }
5295 }
5296 }
5297 }
5298
5299 if (check)
5300 check_caps(in, 0);
5301
5302 // wake up waiters
5303 if (new_caps)
5304 signal_cond_list(in->waitfor_caps);
5305
5306 // may drop inode's last ref
5307 if (deleted_inode)
5308 _try_to_trim_inode(in, true);
5309 }
5310
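// Permission checks. These roughly mirror the kernel's generic_permission():
// uid 0 bypasses everything, POSIX ACLs are consulted when the caller is not
// the owner and group bits are present, and otherwise we fall back to the
// classic mode-bit check in Inode::check_mode().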
5311 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5312 {
5313 if (perms.uid() == 0)
5314 return 0;
5315
5316 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5317 int ret = _posix_acl_permission(in, perms, want);
5318 if (ret != -EAGAIN)
5319 return ret;
5320 }
5321
5322 // check permissions before doing anything else
5323 if (!in->check_mode(perms, want))
5324 return -EACCES;
5325 return 0;
5326 }
5327
5328 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5329 const UserPerm& perms)
5330 {
5331 int r = _getattr_for_perm(in, perms);
5332 if (r < 0)
5333 goto out;
5334
5335 r = 0;
5336 if (strncmp(name, "system.", 7) == 0) {
5337 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5338 r = -EPERM;
5339 } else {
5340 r = inode_permission(in, perms, want);
5341 }
5342 out:
5343 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5344 return r;
5345 }
5346
5347 ostream& operator<<(ostream &out, const UserPerm& perm) {
5348 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5349 return out;
5350 }
5351
5352 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5353 const UserPerm& perms)
5354 {
5355 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5356 int r = _getattr_for_perm(in, perms);
5357 if (r < 0)
5358 goto out;
5359
5360 if (mask & CEPH_SETATTR_SIZE) {
5361 r = inode_permission(in, perms, MAY_WRITE);
5362 if (r < 0)
5363 goto out;
5364 }
5365
5366 r = -EPERM;
5367 if (mask & CEPH_SETATTR_UID) {
5368 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5369 goto out;
5370 }
5371 if (mask & CEPH_SETATTR_GID) {
5372 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5373 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5374 goto out;
5375 }
5376
5377 if (mask & CEPH_SETATTR_MODE) {
5378 if (perms.uid() != 0 && perms.uid() != in->uid)
5379 goto out;
5380
5381 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5382 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5383 stx->stx_mode &= ~S_ISGID;
5384 }
5385
5386 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5387 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5388 if (perms.uid() != 0 && perms.uid() != in->uid) {
5389 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5390 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5391 check_mask |= CEPH_SETATTR_MTIME;
5392 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5393 check_mask |= CEPH_SETATTR_ATIME;
5394 if (check_mask & mask) {
5395 goto out;
5396 } else {
5397 r = inode_permission(in, perms, MAY_WRITE);
5398 if (r < 0)
5399 goto out;
5400 }
5401 }
5402 }
5403 r = 0;
5404 out:
5405 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5406 return r;
5407 }
5408
5409 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5410 {
5411 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5412 unsigned want = 0;
5413
5414 if ((flags & O_ACCMODE) == O_WRONLY)
5415 want = MAY_WRITE;
5416 else if ((flags & O_ACCMODE) == O_RDWR)
5417 want = MAY_READ | MAY_WRITE;
5418 else if ((flags & O_ACCMODE) == O_RDONLY)
5419 want = MAY_READ;
5420 if (flags & O_TRUNC)
5421 want |= MAY_WRITE;
5422
5423 int r = 0;
5424 switch (in->mode & S_IFMT) {
5425 case S_IFLNK:
5426 r = -ELOOP;
5427 goto out;
5428 case S_IFDIR:
5429 if (want & MAY_WRITE) {
5430 r = -EISDIR;
5431 goto out;
5432 }
5433 break;
5434 }
5435
5436 r = _getattr_for_perm(in, perms);
5437 if (r < 0)
5438 goto out;
5439
5440 r = inode_permission(in, perms, want);
5441 out:
5442 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5443 return r;
5444 }
5445
5446 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5447 {
5448 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5449 int r = _getattr_for_perm(dir, perms);
5450 if (r < 0)
5451 goto out;
5452
5453 r = inode_permission(dir, perms, MAY_EXEC);
5454 out:
5455 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5456 return r;
5457 }
5458
5459 int Client::may_create(Inode *dir, const UserPerm& perms)
5460 {
5461 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5462 int r = _getattr_for_perm(dir, perms);
5463 if (r < 0)
5464 goto out;
5465
5466 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5467 out:
5468 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5469 return r;
5470 }
5471
5472 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5473 {
5474 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5475 int r = _getattr_for_perm(dir, perms);
5476 if (r < 0)
5477 goto out;
5478
5479 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5480 if (r < 0)
5481 goto out;
5482
5483 /* 'name == NULL' means rmsnap */
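/* sticky directory: only root, the directory owner, or the owner of the
* victim entry may delete */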
5484 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5485 InodeRef otherin;
5486 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5487 if (r < 0)
5488 goto out;
5489 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5490 r = -EPERM;
5491 }
5492 out:
5493 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5494 return r;
5495 }
5496
5497 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5498 {
5499 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5500 int r = _getattr_for_perm(in, perms);
5501 if (r < 0)
5502 goto out;
5503
5504 if (perms.uid() == 0 || perms.uid() == in->uid) {
5505 r = 0;
5506 goto out;
5507 }
5508
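/* Otherwise apply hardlink-protection checks (analogous to the kernel's
* fs.protected_hardlinks): a non-owner may only link a regular file that is
* not setuid, not setgid+group-executable, and that they can read and write. */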
5509 r = -EPERM;
5510 if (!S_ISREG(in->mode))
5511 goto out;
5512
5513 if (in->mode & S_ISUID)
5514 goto out;
5515
5516 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5517 goto out;
5518
5519 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5520 out:
5521 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5522 return r;
5523 }
5524
5525 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5526 {
5527 int mask = CEPH_STAT_CAP_MODE;
5528 bool force = false;
5529 if (acl_type != NO_ACL) {
5530 mask |= CEPH_STAT_CAP_XATTR;
5531 force = in->xattr_version == 0;
5532 }
5533 return _getattr(in, mask, perms, force);
5534 }
5535
5536 vinodeno_t Client::_get_vino(Inode *in)
5537 {
5538 /* The caller must hold the client lock */
5539 return vinodeno_t(in->ino, in->snapid);
5540 }
5541
5542 /**
5543 * Resolve an MDS spec to a list of MDS daemon GIDs.
5544 *
5545 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5546 * It may be '*' in which case it matches all GIDs.
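* (Illustrative examples only: "4115" as a GID, "cephfs:0" as
* filesystem:rank, "a" as a daemon name.)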
5547 *
5548 * If no error is returned, the `targets` vector will be populated with at least
5549 * one MDS.
5550 */
5551 int Client::resolve_mds(
5552 const std::string &mds_spec,
5553 std::vector<mds_gid_t> *targets)
5554 {
5555 ceph_assert(fsmap);
5556 ceph_assert(targets != nullptr);
5557
5558 mds_role_t role;
5559 std::stringstream ss;
5560 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5561 if (role_r == 0) {
5562 // We got a role, resolve it to a GID
5563 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5564 << role << "'" << dendl;
5565 targets->push_back(
5566 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5567 return 0;
5568 }
5569
5570 std::string strtol_err;
5571 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5572 if (strtol_err.empty()) {
5573 // It is a possible GID
5574 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5575 if (fsmap->gid_exists(mds_gid)) {
5576 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5577 targets->push_back(mds_gid);
5578 } else {
5579 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5580 << dendl;
5581 return -ENOENT;
5582 }
5583 } else if (mds_spec == "*") {
5584 // It is a wildcard: use all MDSs
5585 const auto mds_info = fsmap->get_mds_info();
5586
5587 if (mds_info.empty()) {
5588 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5589 return -ENOENT;
5590 }
5591
5592 for (const auto& i : mds_info) {
5593 targets->push_back(i.first);
5594 }
5595 } else {
5596 // It did not parse as an integer, it is not a wildcard, it must be a name
5597 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5598 if (mds_gid == 0) {
5599 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5600
5601 lderr(cct) << "FSMap: " << *fsmap << dendl;
5602
5603 return -ENOENT;
5604 } else {
5605 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5606 << "' to GID " << mds_gid << dendl;
5607 targets->push_back(mds_gid);
5608 }
5609 }
5610
5611 return 0;
5612 }
5613
5614
5615 /**
5616 * Authenticate with mon and establish global ID
5617 */
5618 int Client::authenticate()
5619 {
5620 ceph_assert(client_lock.is_locked_by_me());
5621
5622 if (monclient->is_authenticated()) {
5623 return 0;
5624 }
5625
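// drop client_lock across the blocking monitor authentication so other
// client threads are not stalled; it is re-acquired below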
5626 client_lock.Unlock();
5627 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5628 client_lock.Lock();
5629 if (r < 0) {
5630 return r;
5631 }
5632
5633 whoami = monclient->get_global_id();
5634 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5635
5636 return 0;
5637 }
5638
5639 int Client::fetch_fsmap(bool user)
5640 {
5641 int r;
5642 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5643 // rather than MDSMap because no one MDSMap contains all the daemons, and
5644 // a `tell` can address any daemon.
5645 version_t fsmap_latest;
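// MonClient completes a pending get_version with -EAGAIN if the mon
// session resets, so retry until we get a definitive answer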
5646 do {
5647 C_SaferCond cond;
5648 monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
5649 client_lock.Unlock();
5650 r = cond.wait();
5651 client_lock.Lock();
5652 } while (r == -EAGAIN);
5653
5654 if (r < 0) {
5655 lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
5656 return r;
5657 }
5658
5659 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5660
5661 if (user) {
5662 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5663 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5664 monclient->renew_subs();
5665 wait_on_list(waiting_for_fsmap);
5666 }
5667 ceph_assert(fsmap_user);
5668 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
5669 } else {
5670 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5671 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5672 monclient->renew_subs();
5673 wait_on_list(waiting_for_fsmap);
5674 }
5675 ceph_assert(fsmap);
5676 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
5677 }
5678 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5679 << fsmap_latest << dendl;
5680 return 0;
5681 }
5682
5683 /**
5684  * Send a command to the MDS daemon(s) matching the given spec.
5685  *
5686  * @mds_spec one of ID, rank, GID, "*"
5687  */
5688 int Client::mds_command(
5689 const std::string &mds_spec,
5690 const vector<string>& cmd,
5691 const bufferlist& inbl,
5692 bufferlist *outbl,
5693 string *outs,
5694 Context *onfinish)
5695 {
5696 std::lock_guard lock(client_lock);
5697
5698 if (!initialized)
5699 return -ENOTCONN;
5700
5701 int r;
5702 r = authenticate();
5703 if (r < 0) {
5704 return r;
5705 }
5706
5707 r = fetch_fsmap(false);
5708 if (r < 0) {
5709 return r;
5710 }
5711
5712 // Look up MDS target(s) of the command
5713 std::vector<mds_gid_t> targets;
5714 r = resolve_mds(mds_spec, &targets);
5715 if (r < 0) {
5716 return r;
5717 }
5718
5719 // If daemons are laggy, we won't send them commands. If all
5720 // are laggy then we fail.
5721 std::vector<mds_gid_t> non_laggy;
5722 for (const auto gid : targets) {
5723 const auto& info = fsmap->get_info_gid(gid);
5724 if (!info.laggy()) {
5725 non_laggy.push_back(gid);
5726 }
5727 }
5728 if (non_laggy.size() == 0) {
5729 *outs = "All targeted MDS daemons are laggy";
5730 return -ENOENT;
5731 }
5732
5733 if (metadata.empty()) {
5734 // We are called on an unmounted client, so metadata
5735 // won't be initialized yet.
5736 populate_metadata("");
5737 }
5738
5739 // Send commands to targets
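// (the gather invokes onfinish only once every targeted daemon has replied)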
5740 C_GatherBuilder gather(cct, onfinish);
5741 for (const auto target_gid : non_laggy) {
5742 const auto& info = fsmap->get_info_gid(target_gid);
5743
5744 // Open a connection to the target MDS
5745 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
5746
5747 // Generate MDSCommandOp state
5748 auto &op = command_table.start_command();
5749
5750 op.on_finish = gather.new_sub();
5751 op.cmd = cmd;
5752 op.outbl = outbl;
5753 op.outs = outs;
5754 op.inbl = inbl;
5755 op.mds_gid = target_gid;
5756 op.con = conn;
5757
5758 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5759 << " tid=" << op.tid << cmd << dendl;
5760
5761 // Construct and send MCommand
5762 auto m = op.get_message(monclient->get_fsid());
5763 conn->send_message2(std::move(m));
5764 }
5765 gather.activate();
5766
5767 return 0;
5768 }
5769
5770 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
5771 {
5772 ceph_tid_t const tid = m->get_tid();
5773
5774 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5775
5776 if (!command_table.exists(tid)) {
5777 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5778 return;
5779 }
5780
5781 auto &op = command_table.get_command(tid);
5782 if (op.outbl) {
5783 *op.outbl = m->get_data();
5784 }
5785 if (op.outs) {
5786 *op.outs = m->rs;
5787 }
5788
5789 if (op.on_finish) {
5790 op.on_finish->complete(m->r);
5791 }
5792
5793 command_table.erase(tid);
5794 }
5795
5796 // -------------------
5797 // MOUNT
5798
5799 int Client::subscribe_mdsmap(const std::string &fs_name)
5800 {
5801 int r = authenticate();
5802 if (r < 0) {
5803 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5804 return r;
5805 }
5806
5807 std::string resolved_fs_name;
5808 if (fs_name.empty()) {
5809 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
5810 } else {
5811 resolved_fs_name = fs_name;
5812 }
5813
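// subscribe to the legacy "mdsmap" unless a filesystem was named or
// configured, in which case the subscription is scoped to "mdsmap.<fscid>" below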
5814 std::string want = "mdsmap";
5815 if (!resolved_fs_name.empty()) {
5816 r = fetch_fsmap(true);
5817 if (r < 0)
5818 return r;
5819 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5820 if (fscid == FS_CLUSTER_ID_NONE) {
5821 return -ENOENT;
5822 }
5823
5824 std::ostringstream oss;
5825 oss << want << "." << fscid;
5826 want = oss.str();
5827 }
5828 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5829
5830 monclient->sub_want(want, 0, 0);
5831 monclient->renew_subs();
5832
5833 return 0;
5834 }
5835
5836 int Client::mount(const std::string &mount_root, const UserPerm& perms,
5837 bool require_mds, const std::string &fs_name)
5838 {
5839 std::lock_guard lock(client_lock);
5840
5841 if (mounted) {
5842 ldout(cct, 5) << "already mounted" << dendl;
5843 return 0;
5844 }
5845
5846 unmounting = false;
5847
5848 int r = subscribe_mdsmap(fs_name);
5849 if (r < 0) {
5850 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
5851 return r;
5852 }
5853
5854 tick(); // start tick
5855
5856 if (require_mds) {
5857 while (1) {
5858 auto availability = mdsmap->is_cluster_available();
5859 if (availability == MDSMap::STUCK_UNAVAILABLE) {
5860 // Error out
5861 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
5862 return CEPH_FUSE_NO_MDS_UP;
5863 } else if (availability == MDSMap::AVAILABLE) {
5864 // Continue to mount
5865 break;
5866 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
5867 // Else, wait. MDSMonitor will update the map to bring
5868 // us to a conclusion eventually.
5869 wait_on_list(waiting_for_mdsmap);
5870 } else {
5871 // Unexpected value!
5872 ceph_abort();
5873 }
5874 }
5875 }
5876
5877 populate_metadata(mount_root.empty() ? "/" : mount_root);
5878
5879 filepath fp(CEPH_INO_ROOT);
5880 if (!mount_root.empty()) {
5881 fp = filepath(mount_root.c_str());
5882 }
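// getattr the mount point, then each ancestor up to the root, so the client
// holds inodes for the whole path; without them quota enforcement may not
// work (see the EACCES case below)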
5883 while (true) {
5884 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
5885 req->set_filepath(fp);
5886 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
5887 int res = make_request(req, perms);
5888 if (res < 0) {
5889 if (res == -EACCES && root) {
5890 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
5891 break;
5892 }
5893 return res;
5894 }
5895
5896 if (fp.depth())
5897 fp.pop_dentry();
5898 else
5899 break;
5900 }
5901
5902 ceph_assert(root);
5903 _ll_get(root);
5904
5905 mounted = true;
5906
5907 // trace?
5908 if (!cct->_conf->client_trace.empty()) {
5909 traceout.open(cct->_conf->client_trace.c_str());
5910 if (traceout.is_open()) {
5911 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
5912 } else {
5913 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
5914 }
5915 }
5916
5917 /*
5918 ldout(cct, 3) << "op: // client trace data structs" << dendl;
5919 ldout(cct, 3) << "op: struct stat st;" << dendl;
5920 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
5921 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
5922 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
5923 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
5924 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
5925 ldout(cct, 3) << "op: int fd;" << dendl;
5926 */
5927 return 0;
5928 }
5929
5930 // UNMOUNT
5931
5932 void Client::_close_sessions()
5933 {
5934 while (!mds_sessions.empty()) {
5935 // send session closes!
5936 for (auto &p : mds_sessions) {
5937 if (p.second.state != MetaSession::STATE_CLOSING) {
5938 _close_mds_session(&p.second);
5939 }
5940 }
5941
5942 // wait for sessions to close
5943 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5944 mount_cond.Wait(client_lock);
5945 }
5946 }
5947
5948 void Client::flush_mdlog_sync()
5949 {
5950 if (mds_requests.empty())
5951 return;
5952 for (auto &p : mds_sessions) {
5953 flush_mdlog(&p.second);
5954 }
5955 }
5956
5957 void Client::flush_mdlog(MetaSession *session)
5958 {
5959 // Only send this to Luminous or newer MDS daemons, older daemons
5960 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5961 const uint64_t features = session->con->get_features();
5962 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5963 auto m = MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5964 session->con->send_message2(std::move(m));
5965 }
5966 }
5967
5968
5969 void Client::_abort_mds_sessions(int err)
5970 {
5971 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
5972 auto req = p->second;
5973 ++p;
5974 // unsafe requests will be removed during close session below.
5975 if (req->got_unsafe)
5976 continue;
5977
5978 req->abort(err);
5979 if (req->caller_cond) {
5980 req->kick = true;
5981 req->caller_cond->Signal();
5982 }
5983 }
5984
5985 // Process aborts on any requests that were on this waitlist.
5986 // Any requests that were on a waiting_for_open session waitlist
5987 // will get kicked during close session below.
5988 signal_cond_list(waiting_for_mdsmap);
5989
5990 // Force-close all sessions
5991 while(!mds_sessions.empty()) {
5992 auto& session = mds_sessions.begin()->second;
5993 _closed_mds_session(&session);
5994 }
5995 }
5996
5997 void Client::_unmount(bool abort)
5998 {
5999 if (unmounting)
6000 return;
6001
6002 if (abort || blacklisted) {
6003 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
6004 } else {
6005 ldout(cct, 2) << "unmounting" << dendl;
6006 }
6007 unmounting = true;
6008
6009 deleg_timeout = 0;
6010
6011 if (abort) {
6012 // Abort all mds sessions
6013 _abort_mds_sessions(-ENOTCONN);
6014
6015 objecter->op_cancel_writes(-ENOTCONN);
6016 } else {
6017 // flush the mdlog for pending requests, if any
6018 flush_mdlog_sync();
6019 }
6020
6021 while (!mds_requests.empty()) {
6022 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
6023 mount_cond.Wait(client_lock);
6024 }
6025
6026 if (tick_event)
6027 timer.cancel_event(tick_event);
6028 tick_event = 0;
6029
6030 cwd.reset();
6031
6032 // clean up any unclosed files
6033 while (!fd_map.empty()) {
6034 Fh *fh = fd_map.begin()->second;
6035 fd_map.erase(fd_map.begin());
6036 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6037 _release_fh(fh);
6038 }
6039
6040 while (!ll_unclosed_fh_set.empty()) {
6041 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6042 Fh *fh = *it;
6043 ll_unclosed_fh_set.erase(fh);
6044 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6045 _release_fh(fh);
6046 }
6047
6048 while (!opened_dirs.empty()) {
6049 dir_result_t *dirp = *opened_dirs.begin();
6050 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6051 _closedir(dirp);
6052 }
6053
6054 _ll_drop_pins();
6055
6056 while (unsafe_sync_write > 0) {
6057 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
6058 mount_cond.Wait(client_lock);
6059 }
6060
6061 if (cct->_conf->client_oc) {
6062 // flush/release all buffered data
6063 std::list<InodeRef> anchor;
6064 for (auto& p : inode_map) {
6065 Inode *in = p.second;
6066 if (!in) {
6067 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6068 ceph_assert(in);
6069 }
6070
6071 // prevent inode from getting freed
6072 anchor.emplace_back(in);
6073
6074 if (abort || blacklisted) {
6075 objectcacher->purge_set(&in->oset);
6076 } else if (!in->caps.empty()) {
6077 _release(in);
6078 _flush(in, new C_Client_FlushComplete(this, in));
6079 }
6080 }
6081 }
6082
6083 if (abort || blacklisted) {
6084 for (auto p = dirty_list.begin(); !p.end(); ) {
6085 Inode *in = *p;
6086 ++p;
6087 if (in->dirty_caps) {
6088 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6089 in->mark_caps_clean();
6090 put_inode(in);
6091 }
6092 }
6093 } else {
6094 flush_caps_sync();
6095 wait_sync_caps(last_flush_tid);
6096 }
6097
6098 // empty lru cache
6099 trim_cache();
6100
6101 while (lru.lru_get_size() > 0 ||
6102 !inode_map.empty()) {
6103 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6104 << "+" << inode_map.size() << " items"
6105 << ", waiting (for caps to release?)"
6106 << dendl;
6107 utime_t until = ceph_clock_now() + utime_t(5, 0);
6108 int r = mount_cond.WaitUntil(client_lock, until);
6109 if (r == ETIMEDOUT) {
6110 dump_cache(NULL);
6111 }
6112 }
6113 ceph_assert(lru.lru_get_size() == 0);
6114 ceph_assert(inode_map.empty());
6115
6116 // stop tracing
6117 if (!cct->_conf->client_trace.empty()) {
6118 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6119 traceout.close();
6120 }
6121
6122 _close_sessions();
6123
6124 mounted = false;
6125
6126 ldout(cct, 2) << "unmounted." << dendl;
6127 }
6128
6129 void Client::unmount()
6130 {
6131 std::lock_guard lock(client_lock);
6132 _unmount(false);
6133 }
6134
6135 void Client::abort_conn()
6136 {
6137 std::lock_guard lock(client_lock);
6138 _unmount(true);
6139 }
6140
6141 void Client::flush_cap_releases()
6142 {
6143 // send any cap releases
6144 for (auto &p : mds_sessions) {
6145 auto &session = p.second;
6146 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6147 p.first)) {
6148 if (cct->_conf->client_inject_release_failure) {
6149 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6150 } else {
6151 session.con->send_message2(std::move(session.release));
6152 }
6153 session.release.reset();
6154 }
6155 }
6156 }
6157
6158 void Client::tick()
6159 {
6160 if (cct->_conf->client_debug_inject_tick_delay > 0) {
6161 sleep(cct->_conf->client_debug_inject_tick_delay);
6162 ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
6163 cct->_conf.apply_changes(nullptr);
6164 }
6165
6166 ldout(cct, 21) << "tick" << dendl;
6167 tick_event = timer.add_event_after(
6168 cct->_conf->client_tick_interval,
6169 new FunctionContext([this](int) {
6170 // Called back via Timer, which takes client_lock for us
6171 ceph_assert(client_lock.is_locked_by_me());
6172 tick();
6173 }));
6174 utime_t now = ceph_clock_now();
6175
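// while mount is still pending, time out the oldest outstanding request
// once client_mount_timeout has elapsed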
6176 if (!mounted && !mds_requests.empty()) {
6177 MetaRequest *req = mds_requests.begin()->second;
6178 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6179 req->abort(-ETIMEDOUT);
6180 if (req->caller_cond) {
6181 req->kick = true;
6182 req->caller_cond->Signal();
6183 }
6184 signal_cond_list(waiting_for_mdsmap);
6185 for (auto &p : mds_sessions) {
6186 signal_context_list(p.second.waiting_for_open);
6187 }
6188 }
6189 }
6190
6191 if (mdsmap->get_epoch()) {
6192 // renew caps?
6193 utime_t el = now - last_cap_renew;
6194 if (el > mdsmap->get_session_timeout() / 3.0)
6195 renew_caps();
6196
6197 flush_cap_releases();
6198 }
6199
6200 // delayed caps
6201 xlist<Inode*>::iterator p = delayed_list.begin();
6202 while (!p.end()) {
6203 Inode *in = *p;
6204 ++p;
6205 if (in->hold_caps_until > now)
6206 break;
6207 delayed_list.pop_front();
6208 check_caps(in, CHECK_CAPS_NODELAY);
6209 }
6210
6211 trim_cache(true);
6212 }
6213
6214 void Client::renew_caps()
6215 {
6216 ldout(cct, 10) << "renew_caps()" << dendl;
6217 last_cap_renew = ceph_clock_now();
6218
6219 for (auto &p : mds_sessions) {
6220 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6221 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6222 renew_caps(&p.second);
6223 }
6224 }
6225
6226 void Client::renew_caps(MetaSession *session)
6227 {
6228 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6229 session->last_cap_renew_request = ceph_clock_now();
6230 uint64_t seq = ++session->cap_renew_seq;
6231 session->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6232 }
6233
6234
6235 // ===============================================================
6236 // high level (POSIXy) interface
6237
6238 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6239 InodeRef *target, const UserPerm& perms)
6240 {
6241 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6242 MetaRequest *req = new MetaRequest(op);
6243 filepath path;
6244 dir->make_nosnap_relative_path(path);
6245 path.push_dentry(name);
6246 req->set_filepath(path);
6247 req->set_inode(dir);
6248 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6249 mask |= DEBUG_GETATTR_CAPS;
6250 req->head.args.getattr.mask = mask;
6251
6252 ldout(cct, 10) << __func__ << " on " << path << dendl;
6253
6254 int r = make_request(req, perms, target);
6255 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6256 return r;
6257 }
6258
6259 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6260 const UserPerm& perms)
6261 {
6262 int r = 0;
6263 Dentry *dn = NULL;
6264
6265 if (dname == "..") {
6266 if (dir->dentries.empty()) {
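// no cached parent link: ask an MDS (picked at random) to resolve ".." for us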
6267 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6268 filepath path(dir->ino);
6269 req->set_filepath(path);
6270
6271 InodeRef tmptarget;
6272 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6273
6274 if (r == 0) {
6275 Inode *tempino = tmptarget.get();
6276 _ll_get(tempino);
6277 *target = tempino;
6278 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6279 } else {
6280 *target = dir;
6281 }
6282 }
6283 else
6284 *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
6285 goto done;
6286 }
6287
6288 if (dname == ".") {
6289 *target = dir;
6290 goto done;
6291 }
6292
6293 if (!dir->is_dir()) {
6294 r = -ENOTDIR;
6295 goto done;
6296 }
6297
6298 if (dname.length() > NAME_MAX) {
6299 r = -ENAMETOOLONG;
6300 goto done;
6301 }
6302
6303 if (dname == cct->_conf->client_snapdir &&
6304 dir->snapid == CEPH_NOSNAP) {
6305 *target = open_snapdir(dir);
6306 goto done;
6307 }
6308
6309 if (dir->dir &&
6310 dir->dir->dentries.count(dname)) {
6311 dn = dir->dir->dentries[dname];
6312
6313 ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
6314 << " seq " << dn->lease_seq
6315 << dendl;
6316
6317 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6318 // is dn lease valid?
6319 utime_t now = ceph_clock_now();
6320 if (dn->lease_mds >= 0 &&
6321 dn->lease_ttl > now &&
6322 mds_sessions.count(dn->lease_mds)) {
6323 MetaSession &s = mds_sessions.at(dn->lease_mds);
6324 if (s.cap_ttl > now &&
6325 s.cap_gen == dn->lease_gen) {
6326 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6327 // make trim_caps() behave.
6328 dir->try_touch_cap(dn->lease_mds);
6329 goto hit_dn;
6330 }
6331 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
6332 << " vs lease_gen " << dn->lease_gen << dendl;
6333 }
6334 // dir shared caps?
6335 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
6336 if (dn->cap_shared_gen == dir->shared_gen &&
6337 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
6338 goto hit_dn;
6339 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6340 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
6341 << *dir << " dn '" << dname << "'" << dendl;
6342 return -ENOENT;
6343 }
6344 }
6345 } else {
6346 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6347 }
6348 } else {
6349 // can we conclude ENOENT locally?
6350 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
6351 (dir->flags & I_COMPLETE)) {
6352 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6353 return -ENOENT;
6354 }
6355 }
6356
6357 r = _do_lookup(dir, dname, mask, target, perms);
6358 goto done;
6359
6360 hit_dn:
6361 if (dn->inode) {
6362 *target = dn->inode;
6363 } else {
6364 r = -ENOENT;
6365 }
6366 touch_dn(dn);
6367
6368 done:
6369 if (r < 0)
6370 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
6371 else
6372 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
6373 return r;
6374 }
6375
6376 int Client::get_or_create(Inode *dir, const char* name,
6377 Dentry **pdn, bool expect_null)
6378 {
6379 // lookup
6380 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6381 dir->open_dir();
6382 if (dir->dir->dentries.count(name)) {
6383 Dentry *dn = dir->dir->dentries[name];
6384
6385 // is dn lease valid?
6386 utime_t now = ceph_clock_now();
6387 if (dn->inode &&
6388 dn->lease_mds >= 0 &&
6389 dn->lease_ttl > now &&
6390 mds_sessions.count(dn->lease_mds)) {
6391 MetaSession &s = mds_sessions.at(dn->lease_mds);
6392 if (s.cap_ttl > now &&
6393 s.cap_gen == dn->lease_gen) {
6394 if (expect_null)
6395 return -EEXIST;
6396 }
6397 }
6398 *pdn = dn;
6399 } else {
6400 // otherwise link up a new one
6401 *pdn = link(dir->dir, name, NULL, NULL);
6402 }
6403
6404 // success
6405 return 0;
6406 }
6407
6408 int Client::path_walk(const filepath& origpath, InodeRef *end,
6409 const UserPerm& perms, bool followsym, int mask)
6410 {
6411 filepath path = origpath;
6412 InodeRef cur;
6413 if (origpath.absolute())
6414 cur = root;
6415 else
6416 cur = cwd;
6417 ceph_assert(cur);
6418
6419 ldout(cct, 10) << __func__ << " " << path << dendl;
6420
6421 int symlinks = 0;
6422
6423 unsigned i=0;
6424 while (i < path.depth() && cur) {
6425 int caps = 0;
6426 const string &dname = path[i];
6427 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6428 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6429 InodeRef next;
6430 if (cct->_conf->client_permissions) {
6431 int r = may_lookup(cur.get(), perms);
6432 if (r < 0)
6433 return r;
6434 caps = CEPH_CAP_AUTH_SHARED;
6435 }
6436
6437 /* Get extra requested caps on the last component */
6438 if (i == (path.depth() - 1))
6439 caps |= mask;
6440 int r = _lookup(cur.get(), dname, caps, &next, perms);
6441 if (r < 0)
6442 return r;
6443 // only follow trailing symlink if followsym. always follow
6444 // 'directory' symlinks.
6445 if (next && next->is_symlink()) {
6446 symlinks++;
6447 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6448 if (symlinks > MAXSYMLINKS) {
6449 return -ELOOP;
6450 }
6451
6452 if (i < path.depth() - 1) {
6453 // dir symlink
6454 // replace consumed components of path with symlink dir target
6455 filepath resolved(next->symlink.c_str());
6456 resolved.append(path.postfixpath(i + 1));
6457 path = resolved;
6458 i = 0;
6459 if (next->symlink[0] == '/') {
6460 cur = root;
6461 }
6462 continue;
6463 } else if (followsym) {
6464 if (next->symlink[0] == '/') {
6465 path = next->symlink.c_str();
6466 i = 0;
6467 // reset position
6468 cur = root;
6469 } else {
6470 filepath more(next->symlink.c_str());
6471 // we need to remove the symlink component from the path
6472 // before appending the target that the symlink points to;
6473 // we remain at the same position in the path.
6474 path.pop_dentry();
6475 path.append(more);
6476 }
6477 continue;
6478 }
6479 }
6480 cur.swap(next);
6481 i++;
6482 }
6483 if (!cur)
6484 return -ENOENT;
6485 if (end)
6486 end->swap(cur);
6487 return 0;
6488 }
6489
6490
6491 // namespace ops
6492
6493 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6494 {
6495 std::lock_guard lock(client_lock);
6496 tout(cct) << "link" << std::endl;
6497 tout(cct) << relexisting << std::endl;
6498 tout(cct) << relpath << std::endl;
6499
6500 if (unmounting)
6501 return -ENOTCONN;
6502
6503 filepath existing(relexisting);
6504
6505 InodeRef in, dir;
6506 int r = path_walk(existing, &in, perm, true);
6507 if (r < 0)
6508 return r;
6509 if (std::string(relpath) == "/") {
6510 r = -EEXIST;
6511 return r;
6512 }
6513 filepath path(relpath);
6514 string name = path.last_dentry();
6515 path.pop_dentry();
6516
6517 r = path_walk(path, &dir, perm, true);
6518 if (r < 0)
6519 return r;
6520 if (cct->_conf->client_permissions) {
6521 if (S_ISDIR(in->mode)) {
6522 r = -EPERM;
6523 return r;
6524 }
6525 r = may_hardlink(in.get(), perm);
6526 if (r < 0)
6527 return r;
6528 r = may_create(dir.get(), perm);
6529 if (r < 0)
6530 return r;
6531 }
6532 r = _link(in.get(), dir.get(), name.c_str(), perm);
6533 return r;
6534 }
6535
6536 int Client::unlink(const char *relpath, const UserPerm& perm)
6537 {
6538 std::lock_guard lock(client_lock);
6539 tout(cct) << __func__ << std::endl;
6540 tout(cct) << relpath << std::endl;
6541
6542 if (unmounting)
6543 return -ENOTCONN;
6544
6545 if (std::string(relpath) == "/")
6546 return -EISDIR;
6547
6548 filepath path(relpath);
6549 string name = path.last_dentry();
6550 path.pop_dentry();
6551 InodeRef dir;
6552 int r = path_walk(path, &dir, perm);
6553 if (r < 0)
6554 return r;
6555 if (cct->_conf->client_permissions) {
6556 r = may_delete(dir.get(), name.c_str(), perm);
6557 if (r < 0)
6558 return r;
6559 }
6560 return _unlink(dir.get(), name.c_str(), perm);
6561 }
6562
6563 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6564 {
6565 std::lock_guard lock(client_lock);
6566 tout(cct) << __func__ << std::endl;
6567 tout(cct) << relfrom << std::endl;
6568 tout(cct) << relto << std::endl;
6569
6570 if (unmounting)
6571 return -ENOTCONN;
6572
6573 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6574 return -EBUSY;
6575
6576 filepath from(relfrom);
6577 filepath to(relto);
6578 string fromname = from.last_dentry();
6579 from.pop_dentry();
6580 string toname = to.last_dentry();
6581 to.pop_dentry();
6582
6583 InodeRef fromdir, todir;
6584 int r = path_walk(from, &fromdir, perm);
6585 if (r < 0)
6586 goto out;
6587 r = path_walk(to, &todir, perm);
6588 if (r < 0)
6589 goto out;
6590
6591 if (cct->_conf->client_permissions) {
6592 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6593 if (r < 0)
6594 return r;
6595 r = may_delete(todir.get(), toname.c_str(), perm);
6596 if (r < 0 && r != -ENOENT)
6597 return r;
6598 }
6599 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6600 out:
6601 return r;
6602 }
6603
6604 // dirs
6605
6606 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6607 {
6608 std::lock_guard lock(client_lock);
6609 tout(cct) << __func__ << std::endl;
6610 tout(cct) << relpath << std::endl;
6611 tout(cct) << mode << std::endl;
6612 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
6613
6614 if (unmounting)
6615 return -ENOTCONN;
6616
6617 if (std::string(relpath) == "/")
6618 return -EEXIST;
6619
6620 filepath path(relpath);
6621 string name = path.last_dentry();
6622 path.pop_dentry();
6623 InodeRef dir;
6624 int r = path_walk(path, &dir, perm);
6625 if (r < 0)
6626 return r;
6627 if (cct->_conf->client_permissions) {
6628 r = may_create(dir.get(), perm);
6629 if (r < 0)
6630 return r;
6631 }
6632 return _mkdir(dir.get(), name.c_str(), mode, perm);
6633 }
6634
6635 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6636 {
6637 std::lock_guard lock(client_lock);
6638 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
6639 tout(cct) << __func__ << std::endl;
6640 tout(cct) << relpath << std::endl;
6641 tout(cct) << mode << std::endl;
6642
6643 if (unmounting)
6644 return -ENOTCONN;
6645
6646 //get through existing parts of path
6647 filepath path(relpath);
6648 unsigned int i;
6649 int r = 0, caps = 0;
6650 InodeRef cur, next;
6651 cur = cwd;
6652 for (i=0; i<path.depth(); ++i) {
6653 if (cct->_conf->client_permissions) {
6654 r = may_lookup(cur.get(), perms);
6655 if (r < 0)
6656 break;
6657 caps = CEPH_CAP_AUTH_SHARED;
6658 }
6659 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6660 if (r < 0)
6661 break;
6662 cur.swap(next);
6663 }
6664 if (r != -ENOENT) return r;
6665 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
6666 //make new directory at each level
6667 for (; i<path.depth(); ++i) {
6668 if (cct->_conf->client_permissions) {
6669 r = may_create(cur.get(), perms);
6670 if (r < 0)
6671 return r;
6672 }
6673 //make new dir
6674 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
6675
6676 //check proper creation/existence
6677 if (r == -EEXIST && i < path.depth() - 1) {
6678 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6679 }
6680 if (r < 0)
6681 return r;
6682 //move to new dir and continue
6683 cur.swap(next);
6684 ldout(cct, 20) << __func__ << ": successfully created directory "
6685 << filepath(cur->ino).get_path() << dendl;
6686 }
6687 return 0;
6688 }
6689
6690 int Client::rmdir(const char *relpath, const UserPerm& perms)
6691 {
6692 std::lock_guard lock(client_lock);
6693 tout(cct) << __func__ << std::endl;
6694 tout(cct) << relpath << std::endl;
6695
6696 if (unmounting)
6697 return -ENOTCONN;
6698
6699 if (std::string(relpath) == "/")
6700 return -EBUSY;
6701
6702 filepath path(relpath);
6703 string name = path.last_dentry();
6704 path.pop_dentry();
6705 InodeRef dir;
6706 int r = path_walk(path, &dir, perms);
6707 if (r < 0)
6708 return r;
6709 if (cct->_conf->client_permissions) {
6710 int r = may_delete(dir.get(), name.c_str(), perms);
6711 if (r < 0)
6712 return r;
6713 }
6714 return _rmdir(dir.get(), name.c_str(), perms);
6715 }
6716
6717 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6718 {
6719 std::lock_guard lock(client_lock);
6720 tout(cct) << __func__ << std::endl;
6721 tout(cct) << relpath << std::endl;
6722 tout(cct) << mode << std::endl;
6723 tout(cct) << rdev << std::endl;
6724
6725 if (unmounting)
6726 return -ENOTCONN;
6727
6728 if (std::string(relpath) == "/")
6729 return -EEXIST;
6730
6731 filepath path(relpath);
6732 string name = path.last_dentry();
6733 path.pop_dentry();
6734 InodeRef dir;
6735 int r = path_walk(path, &dir, perms);
6736 if (r < 0)
6737 return r;
6738 if (cct->_conf->client_permissions) {
6739 int r = may_create(dir.get(), perms);
6740 if (r < 0)
6741 return r;
6742 }
6743 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6744 }
6745
6746 // symlinks
6747
6748 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6749 {
6750 std::lock_guard lock(client_lock);
6751 tout(cct) << __func__ << std::endl;
6752 tout(cct) << target << std::endl;
6753 tout(cct) << relpath << std::endl;
6754
6755 if (unmounting)
6756 return -ENOTCONN;
6757
6758 if (std::string(relpath) == "/")
6759 return -EEXIST;
6760
6761 filepath path(relpath);
6762 string name = path.last_dentry();
6763 path.pop_dentry();
6764 InodeRef dir;
6765 int r = path_walk(path, &dir, perms);
6766 if (r < 0)
6767 return r;
6768 if (cct->_conf->client_permissions) {
6769 int r = may_create(dir.get(), perms);
6770 if (r < 0)
6771 return r;
6772 }
6773 return _symlink(dir.get(), name.c_str(), target, perms);
6774 }
6775
6776 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6777 {
6778 std::lock_guard lock(client_lock);
6779 tout(cct) << __func__ << std::endl;
6780 tout(cct) << relpath << std::endl;
6781
6782 if (unmounting)
6783 return -ENOTCONN;
6784
6785 filepath path(relpath);
6786 InodeRef in;
6787 int r = path_walk(path, &in, perms, false);
6788 if (r < 0)
6789 return r;
6790
6791 return _readlink(in.get(), buf, size);
6792 }
6793
6794 int Client::_readlink(Inode *in, char *buf, size_t size)
6795 {
6796 if (!in->is_symlink())
6797 return -EINVAL;
6798
6799 // copy into buf (at most size bytes)
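// (as with readlink(2), the result is not NUL-terminated)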
6800 int r = in->symlink.length();
6801 if (r > (int)size)
6802 r = size;
6803 memcpy(buf, in->symlink.c_str(), r);
6804 return r;
6805 }
6806
6807
6808 // inode stuff
6809
6810 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6811 {
6812 bool yes = in->caps_issued_mask(mask, true);
6813
6814 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
6815 if (yes && !force)
6816 return 0;
6817
6818 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6819 filepath path;
6820 in->make_nosnap_relative_path(path);
6821 req->set_filepath(path);
6822 req->set_inode(in);
6823 req->head.args.getattr.mask = mask;
6824
6825 int res = make_request(req, perms);
6826 ldout(cct, 10) << __func__ << " result=" << res << dendl;
6827 return res;
6828 }
6829
6830 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6831 const UserPerm& perms, InodeRef *inp)
6832 {
6833 int issued = in->caps_issued();
6834
6835 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
6836 ccap_string(issued) << dendl;
6837
6838 if (in->snapid != CEPH_NOSNAP) {
6839 return -EROFS;
6840 }
6841 if ((mask & CEPH_SETATTR_SIZE) &&
6842 (unsigned long)stx->stx_size > in->size &&
6843 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6844 perms)) {
6845 return -EDQUOT;
6846 }
6847
6848 // make the change locally?
6849 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6850 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6851 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6852 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6853 << in->cap_dirtier_gid << ", forcing sync setattr"
6854 << dendl;
6855 /*
6856 * This works because we implicitly flush the caps as part of the
6857 * request, so the cap update check will happen with the writeback
6858 * cap context, and then the setattr check will happen with the
6859 * caller's context.
6860 *
6861 * In reality this pattern is likely pretty rare (different users
6862 * setattr'ing the same file). If that turns out not to be the
6863 * case later, we can build a more complex pipelined cap writeback
6864 * infrastructure...
6865 */
6866 if (!mask)
6867 mask |= CEPH_SETATTR_CTIME;
6868 goto force_request;
6869 }
6870
6871 if (!mask) {
6872 // caller just needs us to bump the ctime
6873 in->ctime = ceph_clock_now();
6874 in->cap_dirtier_uid = perms.uid();
6875 in->cap_dirtier_gid = perms.gid();
6876 if (issued & CEPH_CAP_AUTH_EXCL)
6877 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6878 else if (issued & CEPH_CAP_FILE_EXCL)
6879 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
6880 else if (issued & CEPH_CAP_XATTR_EXCL)
6881 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
6882 else
6883 mask |= CEPH_SETATTR_CTIME;
6884 }
6885
6886 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6887 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6888
6889 mask &= ~CEPH_SETATTR_KILL_SGUID;
6890
6891 if (mask & CEPH_SETATTR_UID) {
6892 in->ctime = ceph_clock_now();
6893 in->cap_dirtier_uid = perms.uid();
6894 in->cap_dirtier_gid = perms.gid();
6895 in->uid = stx->stx_uid;
6896 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6897 mask &= ~CEPH_SETATTR_UID;
6898 kill_sguid = true;
6899 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6900 }
6901 if (mask & CEPH_SETATTR_GID) {
6902 in->ctime = ceph_clock_now();
6903 in->cap_dirtier_uid = perms.uid();
6904 in->cap_dirtier_gid = perms.gid();
6905 in->gid = stx->stx_gid;
6906 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6907 mask &= ~CEPH_SETATTR_GID;
6908 kill_sguid = true;
6909 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6910 }
6911
6912 if (mask & CEPH_SETATTR_MODE) {
6913 in->ctime = ceph_clock_now();
6914 in->cap_dirtier_uid = perms.uid();
6915 in->cap_dirtier_gid = perms.gid();
6916 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
6917 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6918 mask &= ~CEPH_SETATTR_MODE;
6919 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6920 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
6921 /* Must squash any setuid/setgid bits with an ownership change */
6922 in->mode &= ~(S_ISUID|S_ISGID);
6923 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6924 }
6925
6926 if (mask & CEPH_SETATTR_BTIME) {
6927 in->ctime = ceph_clock_now();
6928 in->cap_dirtier_uid = perms.uid();
6929 in->cap_dirtier_gid = perms.gid();
6930 in->btime = utime_t(stx->stx_btime);
6931 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6932 mask &= ~CEPH_SETATTR_BTIME;
6933 ldout(cct,10) << "changing btime to " << in->btime << dendl;
6934 }
6935 } else if (mask & CEPH_SETATTR_SIZE) {
6936 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6937 mask |= CEPH_SETATTR_KILL_SGUID;
6938 }
6939
6940 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
6941 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
6942 if (mask & CEPH_SETATTR_MTIME)
6943 in->mtime = utime_t(stx->stx_mtime);
6944 if (mask & CEPH_SETATTR_ATIME)
6945 in->atime = utime_t(stx->stx_atime);
6946 in->ctime = ceph_clock_now();
6947 in->cap_dirtier_uid = perms.uid();
6948 in->cap_dirtier_gid = perms.gid();
6949 in->time_warp_seq++;
6950 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
6951 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
6952 }
6953 }
6954 if (!mask) {
6955 in->change_attr++;
6956 return 0;
6957 }
6958
6959 force_request:
6960 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
6961
6962 filepath path;
6963
6964 in->make_nosnap_relative_path(path);
6965 req->set_filepath(path);
6966 req->set_inode(in);
6967
6968 if (mask & CEPH_SETATTR_KILL_SGUID) {
6969 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6970 }
6971 if (mask & CEPH_SETATTR_MODE) {
6972 req->head.args.setattr.mode = stx->stx_mode;
6973 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6974 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6975 }
6976 if (mask & CEPH_SETATTR_UID) {
6977 req->head.args.setattr.uid = stx->stx_uid;
6978 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6979 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6980 }
6981 if (mask & CEPH_SETATTR_GID) {
6982 req->head.args.setattr.gid = stx->stx_gid;
6983 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6984 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6985 }
6986 if (mask & CEPH_SETATTR_BTIME) {
6987 req->head.args.setattr.btime = utime_t(stx->stx_btime);
6988 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6989 }
6990 if (mask & CEPH_SETATTR_MTIME) {
6991 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
6992 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
6993 CEPH_CAP_FILE_WR;
6994 }
6995 if (mask & CEPH_SETATTR_ATIME) {
6996 req->head.args.setattr.atime = utime_t(stx->stx_atime);
6997 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
6998 CEPH_CAP_FILE_WR;
6999 }
7000 if (mask & CEPH_SETATTR_SIZE) {
7001 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
7002 req->head.args.setattr.size = stx->stx_size;
7003 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7004 } else { //too big!
7005 put_request(req);
7006 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7007 return -EFBIG;
7008 }
7009 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7010 CEPH_CAP_FILE_WR;
7011 }
7012 req->head.args.setattr.mask = mask;
7013
7014 req->regetattr_mask = mask;
7015
7016 int res = make_request(req, perms, inp);
7017 ldout(cct, 10) << "_setattr result=" << res << dendl;
7018 return res;
7019 }
7020
7021 /* Note that we only care about attrs that setattr cares about */
7022 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7023 {
7024 stx->stx_size = st->st_size;
7025 stx->stx_mode = st->st_mode;
7026 stx->stx_uid = st->st_uid;
7027 stx->stx_gid = st->st_gid;
7028 #ifdef __APPLE__
7029 stx->stx_mtime = st->st_mtimespec;
7030 stx->stx_atime = st->st_atimespec;
7031 #else
7032 stx->stx_mtime = st->st_mtim;
7033 stx->stx_atime = st->st_atim;
7034 #endif
7035 }
7036
7037 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7038 const UserPerm& perms, InodeRef *inp)
7039 {
7040 int ret = _do_setattr(in, stx, mask, perms, inp);
7041 if (ret < 0)
7042 return ret;
7043 if (mask & CEPH_SETATTR_MODE)
7044 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7045 return ret;
7046 }
7047
7048 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7049 const UserPerm& perms)
7050 {
7051 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7052 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7053 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7054 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7055 if (cct->_conf->client_permissions) {
7056 int r = may_setattr(in.get(), stx, mask, perms);
7057 if (r < 0)
7058 return r;
7059 }
7060 return __setattrx(in.get(), stx, mask, perms);
7061 }
7062
7063 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7064 const UserPerm& perms)
7065 {
7066 struct ceph_statx stx;
7067
7068 stat_to_statx(attr, &stx);
7069 mask &= ~CEPH_SETATTR_BTIME;
7070
7071 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7072 mask &= ~CEPH_SETATTR_UID;
7073 }
7074 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
7075 mask &= ~CEPH_SETATTR_GID;
7076 }
7077
7078 return _setattrx(in, &stx, mask, perms);
7079 }
7080
7081 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7082 const UserPerm& perms)
7083 {
7084 std::lock_guard lock(client_lock);
7085 tout(cct) << __func__ << std::endl;
7086 tout(cct) << relpath << std::endl;
7087 tout(cct) << mask << std::endl;
7088
7089 if (unmounting)
7090 return -ENOTCONN;
7091
7092 filepath path(relpath);
7093 InodeRef in;
7094 int r = path_walk(path, &in, perms);
7095 if (r < 0)
7096 return r;
7097 return _setattr(in, attr, mask, perms);
7098 }
7099
7100 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7101 const UserPerm& perms, int flags)
7102 {
7103 std::lock_guard lock(client_lock);
7104 tout(cct) << __func__ << std::endl;
7105 tout(cct) << relpath << std::endl;
7106 tout(cct) << mask << std::endl;
7107
7108 if (unmounting)
7109 return -ENOTCONN;
7110
7111 filepath path(relpath);
7112 InodeRef in;
7113 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7114 if (r < 0)
7115 return r;
7116 return _setattrx(in, stx, mask, perms);
7117 }
7118
7119 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7120 {
7121 std::lock_guard lock(client_lock);
7122 tout(cct) << __func__ << std::endl;
7123 tout(cct) << fd << std::endl;
7124 tout(cct) << mask << std::endl;
7125
7126 if (unmounting)
7127 return -ENOTCONN;
7128
7129 Fh *f = get_filehandle(fd);
7130 if (!f)
7131 return -EBADF;
7132 #if defined(__linux__) && defined(O_PATH)
7133 if (f->flags & O_PATH)
7134 return -EBADF;
7135 #endif
7136 return _setattr(f->inode, attr, mask, perms);
7137 }
7138
7139 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7140 {
7141 std::lock_guard lock(client_lock);
7142 tout(cct) << __func__ << std::endl;
7143 tout(cct) << fd << std::endl;
7144 tout(cct) << mask << std::endl;
7145
7146 if (unmounting)
7147 return -ENOTCONN;
7148
7149 Fh *f = get_filehandle(fd);
7150 if (!f)
7151 return -EBADF;
7152 #if defined(__linux__) && defined(O_PATH)
7153 if (f->flags & O_PATH)
7154 return -EBADF;
7155 #endif
7156 return _setattrx(f->inode, stx, mask, perms);
7157 }
7158
7159 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7160 frag_info_t *dirstat, int mask)
7161 {
7162 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7163 std::lock_guard lock(client_lock);
7164 tout(cct) << "stat" << std::endl;
7165 tout(cct) << relpath << std::endl;
7166
7167 if (unmounting)
7168 return -ENOTCONN;
7169
7170 filepath path(relpath);
7171 InodeRef in;
7172 int r = path_walk(path, &in, perms, true, mask);
7173 if (r < 0)
7174 return r;
7175 r = _getattr(in, mask, perms);
7176 if (r < 0) {
7177 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7178 return r;
7179 }
7180 fill_stat(in, stbuf, dirstat);
7181 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7182 return r;
7183 }
7184
7185 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7186 {
7187 unsigned mask = 0;
7188
7189 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7190 if (flags & AT_NO_ATTR_SYNC)
7191 goto out;
7192
7193 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7194 mask |= CEPH_CAP_PIN;
7195 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7196 mask |= CEPH_CAP_AUTH_SHARED;
7197 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7198 mask |= CEPH_CAP_LINK_SHARED;
7199 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7200 mask |= CEPH_CAP_FILE_SHARED;
7201 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7202 mask |= CEPH_CAP_XATTR_SHARED;
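/* e.g. want == CEPH_STATX_MODE alone maps to CEPH_CAP_PIN|CEPH_CAP_AUTH_SHARED */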
7203 out:
7204 return mask;
7205 }
7206
7207 int Client::statx(const char *relpath, struct ceph_statx *stx,
7208 const UserPerm& perms,
7209 unsigned int want, unsigned int flags)
7210 {
7211 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7212 std::lock_guard lock(client_lock);
7213 tout(cct) << "statx" << std::endl;
7214 tout(cct) << relpath << std::endl;
7215
7216 if (unmounting)
7217 return -ENOTCONN;
7218
7219 filepath path(relpath);
7220 InodeRef in;
7221
7222 unsigned mask = statx_to_mask(flags, want);
7223
7224 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7225 if (r < 0)
7226 return r;
7227
7228 r = _getattr(in, mask, perms);
7229 if (r < 0) {
7230 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7231 return r;
7232 }
7233
7234 fill_statx(in, mask, stx);
7235 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7236 return r;
7237 }
7238
7239 int Client::lstat(const char *relpath, struct stat *stbuf,
7240 const UserPerm& perms, frag_info_t *dirstat, int mask)
7241 {
7242 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7243 std::lock_guard lock(client_lock);
7244 tout(cct) << __func__ << std::endl;
7245 tout(cct) << relpath << std::endl;
7246
7247 if (unmounting)
7248 return -ENOTCONN;
7249
7250 filepath path(relpath);
7251 InodeRef in;
7252 // don't follow symlinks
7253 int r = path_walk(path, &in, perms, false, mask);
7254 if (r < 0)
7255 return r;
7256 r = _getattr(in, mask, perms);
7257 if (r < 0) {
7258 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7259 return r;
7260 }
7261 fill_stat(in, stbuf, dirstat);
7262 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7263 return r;
7264 }
7265
7266 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7267 {
7268 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
7269 << " mode 0" << oct << in->mode << dec
7270 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7271 memset(st, 0, sizeof(struct stat));
7272 if (use_faked_inos())
7273 st->st_ino = in->faked_ino;
7274 else
7275 st->st_ino = in->ino;
7276 st->st_dev = in->snapid;
7277 st->st_mode = in->mode;
7278 st->st_rdev = in->rdev;
7279 if (in->is_dir()) {
7280 switch (in->nlink) {
7281 case 0:
7282 st->st_nlink = 0; /* dir is unlinked */
7283 break;
7284 case 1:
7285 st->st_nlink = 1 /* parent dentry */
7286 + 1 /* <dir>/. */
7287 + in->dirstat.nsubdirs; /* one <subdir>/.. reference per subdirectory */
7288 break;
7289 default:
7290 ceph_abort();
7291 }
7292 } else {
7293 st->st_nlink = in->nlink;
7294 }
7295 st->st_uid = in->uid;
7296 st->st_gid = in->gid;
7297 if (in->ctime > in->mtime) {
7298 stat_set_ctime_sec(st, in->ctime.sec());
7299 stat_set_ctime_nsec(st, in->ctime.nsec());
7300 } else {
7301 stat_set_ctime_sec(st, in->mtime.sec());
7302 stat_set_ctime_nsec(st, in->mtime.nsec());
7303 }
7304 stat_set_atime_sec(st, in->atime.sec());
7305 stat_set_atime_nsec(st, in->atime.nsec());
7306 stat_set_mtime_sec(st, in->mtime.sec());
7307 stat_set_mtime_nsec(st, in->mtime.nsec());
7308 if (in->is_dir()) {
7309 if (cct->_conf->client_dirsize_rbytes)
7310 st->st_size = in->rstat.rbytes;
7311 else
7312 st->st_size = in->dirstat.size();
7313 st->st_blocks = 1;
7314 } else {
7315 st->st_size = in->size;
7316 st->st_blocks = (in->size + 511) >> 9;
7317 }
7318 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7319
7320 if (dirstat)
7321 *dirstat = in->dirstat;
7322 if (rstat)
7323 *rstat = in->rstat;
7324
7325 return in->caps_issued();
7326 }
7327
7328 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7329 {
7330 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
7331 << " mode 0" << oct << in->mode << dec
7332 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7333 memset(stx, 0, sizeof(struct ceph_statx));
7334
7335 /*
7336 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7337 * so that all bits are set.
7338 */
7339 if (!mask)
7340 mask = ~0;
7341
7342 /* These are always considered to be available */
7343 stx->stx_dev = in->snapid;
7344 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7345
7346 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7347 stx->stx_mode = S_IFMT & in->mode;
7348 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7349 stx->stx_rdev = in->rdev;
7350 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7351
7352 if (mask & CEPH_CAP_AUTH_SHARED) {
7353 stx->stx_uid = in->uid;
7354 stx->stx_gid = in->gid;
7355 stx->stx_mode = in->mode;
7356 in->btime.to_timespec(&stx->stx_btime);
7357 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7358 }
7359
7360 if (mask & CEPH_CAP_LINK_SHARED) {
7361 if (in->is_dir()) {
7362 switch (in->nlink) {
7363 case 0:
7364 stx->stx_nlink = 0; /* dir is unlinked */
7365 break;
7366 case 1:
7367 stx->stx_nlink = 1 /* parent dentry */
7368 + 1 /* <dir>/. */
7369 + in->dirstat.nsubdirs; /* one <subdir>/.. reference per subdirectory */
7370 break;
7371 default:
7372 ceph_abort();
7373 }
7374 } else {
7375 stx->stx_nlink = in->nlink;
7376 }
7377 stx->stx_mask |= CEPH_STATX_NLINK;
7378 }
7379
7380 if (mask & CEPH_CAP_FILE_SHARED) {
7381
7382 in->atime.to_timespec(&stx->stx_atime);
7383 in->mtime.to_timespec(&stx->stx_mtime);
7384
7385 if (in->is_dir()) {
7386 if (cct->_conf->client_dirsize_rbytes)
7387 stx->stx_size = in->rstat.rbytes;
7388 else
7389 stx->stx_size = in->dirstat.size();
7390 stx->stx_blocks = 1;
7391 } else {
7392 stx->stx_size = in->size;
7393 stx->stx_blocks = (in->size + 511) >> 9;
7394 }
7395 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7396 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7397 }
7398
7399 /* Change time and change_attr both require all shared caps to view */
7400 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7401 stx->stx_version = in->change_attr;
7402 if (in->ctime > in->mtime)
7403 in->ctime.to_timespec(&stx->stx_ctime);
7404 else
7405 in->mtime.to_timespec(&stx->stx_ctime);
7406 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7407 }
7408
7409 }
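
/*
 * A caller should trust a statx field only when its bit is set in
 * stx_mask. Minimal sketch of checking the mask (use() is a
 * placeholder; in, flags and want are as in the callers below):
 *
 *   struct ceph_statx stx;
 *   fill_statx(in, statx_to_mask(flags, want), &stx);
 *   if (stx.stx_mask & CEPH_STATX_MTIME)
 *     use(stx.stx_mtime);
 */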
7410
7411 void Client::touch_dn(Dentry *dn)
7412 {
7413 lru.lru_touch(dn);
7414 }
7415
7416 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7417 {
7418 std::lock_guard lock(client_lock);
7419 tout(cct) << __func__ << std::endl;
7420 tout(cct) << relpath << std::endl;
7421 tout(cct) << mode << std::endl;
7422
7423 if (unmounting)
7424 return -ENOTCONN;
7425
7426 filepath path(relpath);
7427 InodeRef in;
7428 int r = path_walk(path, &in, perms);
7429 if (r < 0)
7430 return r;
7431 struct stat attr;
7432 attr.st_mode = mode;
7433 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7434 }
7435
7436 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7437 {
7438 std::lock_guard lock(client_lock);
7439 tout(cct) << __func__ << std::endl;
7440 tout(cct) << fd << std::endl;
7441 tout(cct) << mode << std::endl;
7442
7443 if (unmounting)
7444 return -ENOTCONN;
7445
7446 Fh *f = get_filehandle(fd);
7447 if (!f)
7448 return -EBADF;
7449 #if defined(__linux__) && defined(O_PATH)
7450 if (f->flags & O_PATH)
7451 return -EBADF;
7452 #endif
7453 struct stat attr;
7454 attr.st_mode = mode;
7455 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7456 }
7457
7458 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7459 {
7460 std::lock_guard lock(client_lock);
7461 tout(cct) << __func__ << std::endl;
7462 tout(cct) << relpath << std::endl;
7463 tout(cct) << mode << std::endl;
7464
7465 if (unmounting)
7466 return -ENOTCONN;
7467
7468 filepath path(relpath);
7469 InodeRef in;
7470 // don't follow symlinks
7471 int r = path_walk(path, &in, perms, false);
7472 if (r < 0)
7473 return r;
7474 struct stat attr;
7475 attr.st_mode = mode;
7476 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7477 }
7478
7479 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7480 const UserPerm& perms)
7481 {
7482 std::lock_guard lock(client_lock);
7483 tout(cct) << __func__ << std::endl;
7484 tout(cct) << relpath << std::endl;
7485 tout(cct) << new_uid << std::endl;
7486 tout(cct) << new_gid << std::endl;
7487
7488 if (unmounting)
7489 return -ENOTCONN;
7490
7491 filepath path(relpath);
7492 InodeRef in;
7493 int r = path_walk(path, &in, perms);
7494 if (r < 0)
7495 return r;
7496 struct stat attr;
7497 attr.st_uid = new_uid;
7498 attr.st_gid = new_gid;
7499 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7500 }
7501
7502 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7503 {
7504 std::lock_guard lock(client_lock);
7505 tout(cct) << __func__ << std::endl;
7506 tout(cct) << fd << std::endl;
7507 tout(cct) << new_uid << std::endl;
7508 tout(cct) << new_gid << std::endl;
7509
7510 if (unmounting)
7511 return -ENOTCONN;
7512
7513 Fh *f = get_filehandle(fd);
7514 if (!f)
7515 return -EBADF;
7516 #if defined(__linux__) && defined(O_PATH)
7517 if (f->flags & O_PATH)
7518 return -EBADF;
7519 #endif
7520 struct stat attr;
7521 attr.st_uid = new_uid;
7522 attr.st_gid = new_gid;
7523 int mask = 0;
7524 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7525 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7526 return _setattr(f->inode, &attr, mask, perms);
7527 }
7528
7529 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7530 const UserPerm& perms)
7531 {
7532 std::lock_guard lock(client_lock);
7533 tout(cct) << __func__ << std::endl;
7534 tout(cct) << relpath << std::endl;
7535 tout(cct) << new_uid << std::endl;
7536 tout(cct) << new_gid << std::endl;
7537
7538 if (unmounting)
7539 return -ENOTCONN;
7540
7541 filepath path(relpath);
7542 InodeRef in;
7543 // don't follow symlinks
7544 int r = path_walk(path, &in, perms, false);
7545 if (r < 0)
7546 return r;
7547 struct stat attr;
7548 attr.st_uid = new_uid;
7549 attr.st_gid = new_gid;
7550 int mask = 0;
7551 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7552 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7553 return _setattr(in, &attr, mask, perms);
7554 }
7555
7556 static void attr_set_atime_and_mtime(struct stat *attr,
7557 const utime_t &atime,
7558 const utime_t &mtime)
7559 {
7560 stat_set_atime_sec(attr, atime.tv.tv_sec);
7561 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
7562 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
7563 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
7564 }
7565
7566 // for [l]utime(), invoke the timeval variant, as the timespec
7567 // variants are not yet implemented. for futime[s](), invoke
7568 // the timespec variant.
7569 int Client::utime(const char *relpath, struct utimbuf *buf,
7570 const UserPerm& perms)
7571 {
7572 struct timeval tv[2];
7573 tv[0].tv_sec = buf->actime;
7574 tv[0].tv_usec = 0;
7575 tv[1].tv_sec = buf->modtime;
7576 tv[1].tv_usec = 0;
7577
7578 return utimes(relpath, tv, perms);
7579 }
7580
7581 int Client::lutime(const char *relpath, struct utimbuf *buf,
7582 const UserPerm& perms)
7583 {
7584 struct timeval tv[2];
7585 tv[0].tv_sec = buf->actime;
7586 tv[0].tv_usec = 0;
7587 tv[1].tv_sec = buf->modtime;
7588 tv[1].tv_usec = 0;
7589
7590 return lutimes(relpath, tv, perms);
7591 }
7592
7593 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7594 {
7595 struct timespec ts[2];
7596 ts[0].tv_sec = buf->actime;
7597 ts[0].tv_nsec = 0;
7598 ts[1].tv_sec = buf->modtime;
7599 ts[1].tv_nsec = 0;
7600
7601 return futimens(fd, ts, perms);
7602 }
7603
7604 int Client::utimes(const char *relpath, struct timeval times[2],
7605 const UserPerm& perms)
7606 {
7607 std::lock_guard lock(client_lock);
7608 tout(cct) << __func__ << std::endl;
7609 tout(cct) << relpath << std::endl;
7610 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7611 << std::endl;
7612 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7613 << std::endl;
7614
7615 if (unmounting)
7616 return -ENOTCONN;
7617
7618 filepath path(relpath);
7619 InodeRef in;
7620 int r = path_walk(path, &in, perms);
7621 if (r < 0)
7622 return r;
7623 struct stat attr;
7624 utime_t atime(times[0]);
7625 utime_t mtime(times[1]);
7626
7627 attr_set_atime_and_mtime(&attr, atime, mtime);
7628 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7629 }
7630
7631 int Client::lutimes(const char *relpath, struct timeval times[2],
7632 const UserPerm& perms)
7633 {
7634 std::lock_guard lock(client_lock);
7635 tout(cct) << __func__ << std::endl;
7636 tout(cct) << relpath << std::endl;
7637 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7638 << std::endl;
7639 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7640 << std::endl;
7641
7642 if (unmounting)
7643 return -ENOTCONN;
7644
7645 filepath path(relpath);
7646 InodeRef in;
7647 int r = path_walk(path, &in, perms, false);
7648 if (r < 0)
7649 return r;
7650 struct stat attr;
7651 utime_t atime(times[0]);
7652 utime_t mtime(times[1]);
7653
7654 attr_set_atime_and_mtime(&attr, atime, mtime);
7655 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7656 }
7657
7658 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7659 {
7660 struct timespec ts[2];
7661 ts[0].tv_sec = times[0].tv_sec;
7662 ts[0].tv_nsec = times[0].tv_usec * 1000;
7663 ts[1].tv_sec = times[1].tv_sec;
7664 ts[1].tv_nsec = times[1].tv_usec * 1000;
7665
7666 return futimens(fd, ts, perms);
7667 }
7668
7669 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7670 {
7671 std::lock_guard lock(client_lock);
7672 tout(cct) << __func__ << std::endl;
7673 tout(cct) << fd << std::endl;
7674 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7675 << std::endl;
7676 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7677 << std::endl;
7678
7679 if (unmounting)
7680 return -ENOTCONN;
7681
7682 Fh *f = get_filehandle(fd);
7683 if (!f)
7684 return -EBADF;
7685 #if defined(__linux__) && defined(O_PATH)
7686 if (f->flags & O_PATH)
7687 return -EBADF;
7688 #endif
7689 struct stat attr;
7690 utime_t atime(times[0]);
7691 utime_t mtime(times[1]);
7692
7693 attr_set_atime_and_mtime(&attr, atime, mtime);
7694 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7695 }
7696
7697 int Client::flock(int fd, int operation, uint64_t owner)
7698 {
7699 std::lock_guard lock(client_lock);
7700 tout(cct) << __func__ << std::endl;
7701 tout(cct) << fd << std::endl;
7702 tout(cct) << operation << std::endl;
7703 tout(cct) << owner << std::endl;
7704
7705 if (unmounting)
7706 return -ENOTCONN;
7707
7708 Fh *f = get_filehandle(fd);
7709 if (!f)
7710 return -EBADF;
7711
7712 return _flock(f, operation, owner);
7713 }
7714
7715 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7716 {
7717 std::lock_guard lock(client_lock);
7718 tout(cct) << __func__ << std::endl;
7719 tout(cct) << relpath << std::endl;
7720
7721 if (unmounting)
7722 return -ENOTCONN;
7723
7724 filepath path(relpath);
7725 InodeRef in;
7726 int r = path_walk(path, &in, perms, true);
7727 if (r < 0)
7728 return r;
7729 if (cct->_conf->client_permissions) {
7730 int r = may_open(in.get(), O_RDONLY, perms);
7731 if (r < 0)
7732 return r;
7733 }
7734 r = _opendir(in.get(), dirpp, perms);
7735 /* on ENOTDIR, *dirpp is left uninitialized, so it must not be dereferenced */
7736 if (r != -ENOTDIR)
7737 tout(cct) << (unsigned long)*dirpp << std::endl;
7738 return r;
7739 }
7740
7741 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7742 {
7743 if (!in->is_dir())
7744 return -ENOTDIR;
7745 *dirpp = new dir_result_t(in, perms);
7746 opened_dirs.insert(*dirpp);
7747 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7748 return 0;
7749 }
7750
7751
7752 int Client::closedir(dir_result_t *dir)
7753 {
7754 std::lock_guard lock(client_lock);
7755 tout(cct) << __func__ << std::endl;
7756 tout(cct) << (unsigned long)dir << std::endl;
7757
7758 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
7759 _closedir(dir);
7760 return 0;
7761 }
7762
7763 void Client::_closedir(dir_result_t *dirp)
7764 {
7765 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
7766 if (dirp->inode) {
7767 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
7768 dirp->inode.reset();
7769 }
7770 _readdir_drop_dirp_buffer(dirp);
7771 opened_dirs.erase(dirp);
7772 delete dirp;
7773 }
7774
7775 void Client::rewinddir(dir_result_t *dirp)
7776 {
7777 std::lock_guard lock(client_lock);
7778 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
7779
7780 if (unmounting)
7781 return;
7782
7783 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7784 _readdir_drop_dirp_buffer(d);
7785 d->reset();
7786 }
7787
7788 loff_t Client::telldir(dir_result_t *dirp)
7789 {
7790 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7791 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
7792 return d->offset;
7793 }
7794
7795 void Client::seekdir(dir_result_t *dirp, loff_t offset)
7796 {
7797 std::lock_guard lock(client_lock);
7798
7799 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
7800
7801 if (unmounting)
7802 return;
7803
7804 if (offset == dirp->offset)
7805 return;
7806
7807 if (offset > dirp->offset)
7808 dirp->release_count = 0; // forward seek: invalidate the completeness check
7809 else
7810 dirp->ordered_count = 0; // disable filling readdir cache
7811
7812 if (dirp->hash_order()) {
7813 if (dirp->offset > offset) {
7814 _readdir_drop_dirp_buffer(dirp);
7815 dirp->reset();
7816 }
7817 } else {
7818 if (offset == 0 ||
7819 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
7820 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
7821 _readdir_drop_dirp_buffer(dirp);
7822 dirp->reset();
7823 }
7824 }
7825
7826 dirp->offset = offset;
7827 }
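
/*
 * Note: readdir offsets are synthetic, not byte offsets. Roughly, the
 * high bits carry the fragment (or, in hash-order mode, the name hash)
 * and the low bits the position within it; fpos_high()/fpos_low() split
 * them and make_fpos() joins them. That is why seeking backwards, or
 * into a different fragment, drops the buffered frag above and refetches
 * from the MDS.
 */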
7828
7829
7830 //struct dirent {
7831 // ino_t d_ino; /* inode number */
7832 // off_t d_off; /* offset to the next dirent */
7833 // unsigned short d_reclen; /* length of this record */
7834 // unsigned char d_type; /* type of file */
7835 // char d_name[256]; /* filename */
7836 //};
7837 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
7838 {
7839 strncpy(de->d_name, name, 255);
7840 de->d_name[255] = '\0';
7841 #ifndef __CYGWIN__
7842 de->d_ino = ino;
7843 #if !defined(__APPLE__) && !defined(__FreeBSD__)
7844 de->d_off = next_off;
7845 #endif
7846 de->d_reclen = 1;
7847 de->d_type = IFTODT(type);
7848 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
7849 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
7850 #endif
7851 }
7852
7853 void Client::_readdir_next_frag(dir_result_t *dirp)
7854 {
7855 frag_t fg = dirp->buffer_frag;
7856
7857 if (fg.is_rightmost()) {
7858 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
7859 dirp->set_end();
7860 return;
7861 }
7862
7863 // advance
7864 fg = fg.next();
7865 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
7866
7867 if (dirp->hash_order()) {
7868 // keep last_name
7869 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
7870 if (dirp->offset < new_offset) // don't decrease offset
7871 dirp->offset = new_offset;
7872 } else {
7873 dirp->last_name.clear();
7874 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7875 _readdir_rechoose_frag(dirp);
7876 }
7877 }
7878
7879 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
7880 {
7881 ceph_assert(dirp->inode);
7882
7883 if (dirp->hash_order())
7884 return;
7885
7886 frag_t cur = frag_t(dirp->offset_high());
7887 frag_t fg = dirp->inode->dirfragtree[cur.value()];
7888 if (fg != cur) {
7889 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
7890 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7891 dirp->last_name.clear();
7892 dirp->next_offset = 2;
7893 }
7894 }
7895
7896 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7897 {
7898 ldout(cct, 10) << __func__ << " " << dirp << dendl;
7899 dirp->buffer.clear();
7900 }
7901
7902 int Client::_readdir_get_frag(dir_result_t *dirp)
7903 {
7904 ceph_assert(dirp);
7905 ceph_assert(dirp->inode);
7906
7907 // get the current frag.
7908 frag_t fg;
7909 if (dirp->hash_order())
7910 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7911 else
7912 fg = frag_t(dirp->offset_high());
7913
7914 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
7915 << " offset " << hex << dirp->offset << dec << dendl;
7916
7917 int op = CEPH_MDS_OP_READDIR;
7918 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7919 op = CEPH_MDS_OP_LSSNAP;
7920
7921 InodeRef& diri = dirp->inode;
7922
7923 MetaRequest *req = new MetaRequest(op);
7924 filepath path;
7925 diri->make_nosnap_relative_path(path);
7926 req->set_filepath(path);
7927 req->set_inode(diri.get());
7928 req->head.args.readdir.frag = fg;
7929 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7930 if (dirp->last_name.length()) {
7931 req->path2.set_path(dirp->last_name);
7932 } else if (dirp->hash_order()) {
7933 req->head.args.readdir.offset_hash = dirp->offset_high();
7934 }
7935 req->dirp = dirp;
7936
7937 bufferlist dirbl;
7938 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7939
7940 if (res == -EAGAIN) {
7941 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
7942 _readdir_rechoose_frag(dirp);
7943 return _readdir_get_frag(dirp);
7944 }
7945
7946 if (res == 0) {
7947 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
7948 << " size " << dirp->buffer.size() << dendl;
7949 } else {
7950 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
7951 dirp->set_end();
7952 }
7953
7954 return res;
7955 }
7956
7957 struct dentry_off_lt {
7958 bool operator()(const Dentry* dn, int64_t off) const {
7959 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
7960 }
7961 };
7962
7963 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
7964 int caps, bool getref)
7965 {
7966 ceph_assert(client_lock.is_locked());
7967 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
7968 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
7969 << dendl;
7970 Dir *dir = dirp->inode->dir;
7971
7972 if (!dir) {
7973 ldout(cct, 10) << " dir is empty" << dendl;
7974 dirp->set_end();
7975 return 0;
7976 }
7977
7978 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
7979 dir->readdir_cache.end(),
7980 dirp->offset, dentry_off_lt());
7981
7982 string dn_name;
7983 while (true) {
7984 if (!dirp->inode->is_complete_and_ordered())
7985 return -EAGAIN;
7986 if (pd == dir->readdir_cache.end())
7987 break;
7988 Dentry *dn = *pd;
7989 if (dn->inode == NULL) {
7990 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
7991 ++pd;
7992 continue;
7993 }
7994 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
7995 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
7996 ++pd;
7997 continue;
7998 }
7999
8000 int idx = pd - dir->readdir_cache.begin();
8001 int r = _getattr(dn->inode, caps, dirp->perms);
8002 if (r < 0)
8003 return r;
8004
8005 // the content of readdir_cache may change after _getattr(), so pd may be an invalid iterator
8006 pd = dir->readdir_cache.begin() + idx;
8007 if (pd >= dir->readdir_cache.end() || *pd != dn)
8008 return -EAGAIN;
8009
8010 struct ceph_statx stx;
8011 struct dirent de;
8012 fill_statx(dn->inode, caps, &stx);
8013
8014 uint64_t next_off = dn->offset + 1;
8015 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8016 ++pd;
8017 if (pd == dir->readdir_cache.end())
8018 next_off = dir_result_t::END;
8019
8020 Inode *in = NULL;
8021 if (getref) {
8022 in = dn->inode.get();
8023 _ll_get(in);
8024 }
8025
8026 dn_name = dn->name; // fill in name while we have lock
8027
8028 client_lock.Unlock();
8029 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8030 client_lock.Lock();
8031 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8032 << " = " << r << dendl;
8033 if (r < 0) {
8034 return r;
8035 }
8036
8037 dirp->offset = next_off;
8038 if (dirp->at_end())
8039 dirp->next_offset = 2;
8040 else
8041 dirp->next_offset = dirp->offset_low();
8042 dirp->last_name = dn_name; // we successfully returned this one; update!
8043 dirp->release_count = 0; // last_name no longer matches the cache index
8044 if (r > 0)
8045 return r;
8046 }
8047
8048 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
8049 dirp->set_end();
8050 return 0;
8051 }
8052
8053 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8054 unsigned want, unsigned flags, bool getref)
8055 {
8056 int caps = statx_to_mask(flags, want);
8057
8058 std::lock_guard lock(client_lock);
8059
8060 if (unmounting)
8061 return -ENOTCONN;
8062
8063 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8064
8065 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
8066 << dec << " at_end=" << dirp->at_end()
8067 << " hash_order=" << dirp->hash_order() << dendl;
8068
8069 struct dirent de;
8070 struct ceph_statx stx;
8071 memset(&de, 0, sizeof(de));
8072 memset(&stx, 0, sizeof(stx));
8073
8074 InodeRef& diri = dirp->inode;
8075
8076 if (dirp->at_end())
8077 return 0;
8078
8079 if (dirp->offset == 0) {
8080 ldout(cct, 15) << " including ." << dendl;
8081 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
8082 uint64_t next_off = 1;
8083
8084 int r;
8085 r = _getattr(diri, caps, dirp->perms);
8086 if (r < 0)
8087 return r;
8088
8089 fill_statx(diri, caps, &stx);
8090 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8091
8092 Inode *inode = NULL;
8093 if (getref) {
8094 inode = diri.get();
8095 _ll_get(inode);
8096 }
8097
8098 client_lock.Unlock();
8099 r = cb(p, &de, &stx, next_off, inode);
8100 client_lock.Lock();
8101 if (r < 0)
8102 return r;
8103
8104 dirp->offset = next_off;
8105 if (r > 0)
8106 return r;
8107 }
8108 if (dirp->offset == 1) {
8109 ldout(cct, 15) << " including .." << dendl;
8110 uint64_t next_off = 2;
8111 InodeRef in;
8112 if (diri->dentries.empty())
8113 in = diri;
8114 else
8115 in = diri->get_first_parent()->dir->parent_inode;
8116
8117 int r;
8118 r = _getattr(in, caps, dirp->perms);
8119 if (r < 0)
8120 return r;
8121
8122 fill_statx(in, caps, &stx);
8123 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8124
8125 Inode *inode = NULL;
8126 if (getref) {
8127 inode = in.get();
8128 _ll_get(inode);
8129 }
8130
8131 client_lock.Unlock();
8132 r = cb(p, &de, &stx, next_off, inode);
8133 client_lock.Lock();
8134 if (r < 0)
8135 return r;
8136
8137 dirp->offset = next_off;
8138 if (r > 0)
8139 return r;
8140 }
8141
8142 // can we read from our cache?
8143 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8144 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8145 << dirp->inode->is_complete_and_ordered()
8146 << " issued " << ccap_string(dirp->inode->caps_issued())
8147 << dendl;
8148 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8149 dirp->inode->is_complete_and_ordered() &&
8150 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
8151 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
8152 if (err != -EAGAIN)
8153 return err;
8154 }
8155
8156 while (1) {
8157 if (dirp->at_end())
8158 return 0;
8159
8160 bool check_caps = true;
8161 if (!dirp->is_cached()) {
8162 int r = _readdir_get_frag(dirp);
8163 if (r)
8164 return r;
8165 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
8166 // different from the requested one (i.e. our dirfragtree was outdated).
8167 check_caps = false;
8168 }
8169 frag_t fg = dirp->buffer_frag;
8170
8171 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8172 << " offset " << hex << dirp->offset << dendl;
8173
8174 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8175 dirp->offset, dir_result_t::dentry_off_lt());
8176 it != dirp->buffer.end();
8177 ++it) {
8178 dir_result_t::dentry &entry = *it;
8179
8180 uint64_t next_off = entry.offset + 1;
8181
8182 int r;
8183 if (check_caps) {
8184 r = _getattr(entry.inode, caps, dirp->perms);
8185 if (r < 0)
8186 return r;
8187 }
8188
8189 fill_statx(entry.inode, caps, &stx);
8190 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8191
8192 Inode *inode = NULL;
8193 if (getref) {
8194 inode = entry.inode.get();
8195 _ll_get(inode);
8196 }
8197
8198 client_lock.Unlock();
8199 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
8200 client_lock.Lock();
8201
8202 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
8203 << " = " << r << dendl;
8204 if (r < 0)
8205 return r;
8206
8207 dirp->offset = next_off;
8208 if (r > 0)
8209 return r;
8210 }
8211
8212 if (dirp->next_offset > 2) {
8213 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
8214 _readdir_drop_dirp_buffer(dirp);
8215 continue; // more!
8216 }
8217
8218 if (!fg.is_rightmost()) {
8219 // next frag!
8220 _readdir_next_frag(dirp);
8221 continue;
8222 }
8223
8224 if (diri->shared_gen == dirp->start_shared_gen &&
8225 diri->dir_release_count == dirp->release_count) {
8226 if (diri->dir_ordered_count == dirp->ordered_count) {
8227 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
8228 if (diri->dir) {
8229 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
8230 diri->dir->readdir_cache.resize(dirp->cache_index);
8231 }
8232 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
8233 } else {
8234 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
8235 diri->flags |= I_COMPLETE;
8236 }
8237 }
8238
8239 dirp->set_end();
8240 return 0;
8241 }
8242 ceph_abort();
8243 return 0;
8244 }
8245
8246
8247 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8248 {
8249 return readdirplus_r(d, de, 0, 0, 0, NULL);
8250 }
8251
8252 /*
8253 * readdirplus_r
8254 *
8255 * returns
8256 * 1 if we got a dirent
8257 * 0 for end of directory
8258 * <0 on error
8259 */
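/*
 * Rough usage sketch (names as declared in this file; handle() is a
 * placeholder for caller logic, client a mounted Client instance and
 * perms a caller-held UserPerm):
 *
 *   dir_result_t *d;
 *   struct dirent de;
 *   struct ceph_statx stx;
 *   if (client->opendir(".", &d, perms) == 0) {
 *     while (client->readdirplus_r(d, &de, &stx, CEPH_STATX_INO, 0, NULL) == 1)
 *       handle(&de, &stx);
 *     client->closedir(d);
 *   }
 */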
8260
8261 struct single_readdir {
8262 struct dirent *de;
8263 struct ceph_statx *stx;
8264 Inode *inode;
8265 bool full;
8266 };
8267
8268 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8269 struct ceph_statx *stx, off_t off,
8270 Inode *in)
8271 {
8272 single_readdir *c = static_cast<single_readdir *>(p);
8273
8274 if (c->full)
8275 return -1; // already filled this dirent
8276
8277 *c->de = *de;
8278 if (c->stx)
8279 *c->stx = *stx;
8280 c->inode = in;
8281 c->full = true;
8282 return 1;
8283 }
8284
8285 struct dirent *Client::readdir(dir_result_t *d)
8286 {
8287 int ret;
8288 static struct dirent de;
8289 single_readdir sr;
8290 sr.de = &de;
8291 sr.stx = NULL;
8292 sr.inode = NULL;
8293 sr.full = false;
8294
8295 // our callback fills the dirent and sets sr.full=true on first
8296 // call, and returns -1 the second time around.
8297 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8298 if (ret < -1) {
8299 errno = -ret; // this sucks.
8300 return (dirent *) NULL;
8301 }
8302 if (sr.full) {
8303 return &de;
8304 }
8305 return (dirent *) NULL;
8306 }
8307
8308 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8309 struct ceph_statx *stx, unsigned want,
8310 unsigned flags, Inode **out)
8311 {
8312 single_readdir sr;
8313 sr.de = de;
8314 sr.stx = stx;
8315 sr.inode = NULL;
8316 sr.full = false;
8317
8318 // our callback fills the dirent and sets sr.full=true on first
8319 // call, and returns -1 the second time around.
8320 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8321 if (r < -1)
8322 return r;
8323 if (out)
8324 *out = sr.inode;
8325 if (sr.full)
8326 return 1;
8327 return 0;
8328 }
8329
8330
8331 /* getdents */
8332 struct getdents_result {
8333 char *buf;
8334 int buflen;
8335 int pos;
8336 bool fullent;
8337 };
8338
8339 static int _readdir_getdent_cb(void *p, struct dirent *de,
8340 struct ceph_statx *stx, off_t off, Inode *in)
8341 {
8342 struct getdents_result *c = static_cast<getdents_result *>(p);
8343
8344 int dlen;
8345 if (c->fullent)
8346 dlen = sizeof(*de);
8347 else
8348 dlen = strlen(de->d_name) + 1;
8349
8350 if (c->pos + dlen > c->buflen)
8351 return -1; // doesn't fit
8352
8353 if (c->fullent) {
8354 memcpy(c->buf + c->pos, de, sizeof(*de));
8355 } else {
8356 memcpy(c->buf + c->pos, de->d_name, dlen);
8357 }
8358 c->pos += dlen;
8359 return 0;
8360 }
8361
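/*
 * Fill `buf` with directory entries. With fullent=true the buffer holds
 * packed fixed-size struct dirent records; with fullent=false it holds
 * just the NUL-terminated names back to back. Returns the number of
 * bytes filled, or -ERANGE when not even one entry fits.
 */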
8362 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8363 {
8364 getdents_result gr;
8365 gr.buf = buf;
8366 gr.buflen = buflen;
8367 gr.fullent = fullent;
8368 gr.pos = 0;
8369
8370 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8371
8372 if (r < 0) { // some error
8373 if (r == -1) { // buffer ran out of space
8374 if (gr.pos) { // but we got some entries already!
8375 return gr.pos;
8376 } // or we need a larger buffer
8377 return -ERANGE;
8378 } else { // actual error, return it
8379 return r;
8380 }
8381 }
8382 return gr.pos;
8383 }
8384
8385
8386 /* getdir */
8387 struct getdir_result {
8388 list<string> *contents;
8389 int num;
8390 };
8391
8392 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8393 {
8394 getdir_result *r = static_cast<getdir_result *>(p);
8395
8396 r->contents->push_back(de->d_name);
8397 r->num++;
8398 return 0;
8399 }
8400
8401 int Client::getdir(const char *relpath, list<string>& contents,
8402 const UserPerm& perms)
8403 {
8404 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8405 {
8406 std::lock_guard lock(client_lock);
8407 tout(cct) << "getdir" << std::endl;
8408 tout(cct) << relpath << std::endl;
8409 }
8410
8411 dir_result_t *d;
8412 int r = opendir(relpath, &d, perms);
8413 if (r < 0)
8414 return r;
8415
8416 getdir_result gr;
8417 gr.contents = &contents;
8418 gr.num = 0;
8419 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8420
8421 closedir(d);
8422
8423 if (r < 0)
8424 return r;
8425 return gr.num;
8426 }
8427
8428
8429 /****** file i/o **********/
8430 int Client::open(const char *relpath, int flags, const UserPerm& perms,
8431 mode_t mode, int stripe_unit, int stripe_count,
8432 int object_size, const char *data_pool)
8433 {
8434 ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
8435 std::lock_guard lock(client_lock);
8436 tout(cct) << "open" << std::endl;
8437 tout(cct) << relpath << std::endl;
8438 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
8439
8440 if (unmounting)
8441 return -ENOTCONN;
8442
8443 Fh *fh = NULL;
8444
8445 #if defined(__linux__) && defined(O_PATH)
8446 /* When O_PATH is specified, flags other than O_DIRECTORY
8447 * and O_NOFOLLOW are ignored. See the do_entry_open() function
8448 * in the kernel (fs/open.c). */
8449 if (flags & O_PATH)
8450 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
8451 #endif
8452
8453 filepath path(relpath);
8454 InodeRef in;
8455 bool created = false;
8456 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
8457 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
8458 int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));
8459
8460 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
8461 return -EEXIST;
8462
8463 #if defined(__linux__) && defined(O_PATH)
8464 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
8465 #else
8466 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
8467 #endif
8468 return -ELOOP;
8469
8470 if (r == -ENOENT && (flags & O_CREAT)) {
8471 filepath dirpath = path;
8472 string dname = dirpath.last_dentry();
8473 dirpath.pop_dentry();
8474 InodeRef dir;
8475 r = path_walk(dirpath, &dir, perms, true,
8476 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
8477 if (r < 0)
8478 goto out;
8479 if (cct->_conf->client_permissions) {
8480 r = may_create(dir.get(), perms);
8481 if (r < 0)
8482 goto out;
8483 }
8484 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
8485 stripe_count, object_size, data_pool, &created, perms);
8486 }
8487 if (r < 0)
8488 goto out;
8489
8490 if (!created) {
8491 // posix says we can only check permissions of existing files
8492 if (cct->_conf->client_permissions) {
8493 r = may_open(in.get(), flags, perms);
8494 if (r < 0)
8495 goto out;
8496 }
8497 }
8498
8499 if (!fh)
8500 r = _open(in.get(), flags, mode, &fh, perms);
8501 if (r >= 0) {
8502 // allocate an integer file descriptor
8503 ceph_assert(fh);
8504 r = get_fd();
8505 ceph_assert(fd_map.count(r) == 0);
8506 fd_map[r] = fh;
8507 }
8508
8509 out:
8510 tout(cct) << r << std::endl;
8511 ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
8512 return r;
8513 }
8514
8515 int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8516 {
8517 /* Use default file striping parameters */
8518 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8519 }
8520
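/*
 * Ask the MDS to look an inode up by (parent dir ino, dentry-name hash):
 * path2 is built as <dirino>/<rjenkins hash of name>, which lets the MDS
 * locate the right dirfrag without a full path.
 */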
8521 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8522 const UserPerm& perms)
8523 {
8524 std::lock_guard lock(client_lock);
8525 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8526
8527 if (unmounting)
8528 return -ENOTCONN;
8529
8530 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8531 filepath path(ino);
8532 req->set_filepath(path);
8533
8534 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8535 char f[30];
8536 sprintf(f, "%u", h);
8537 filepath path2(dirino);
8538 path2.push_dentry(string(f));
8539 req->set_filepath2(path2);
8540
8541 int r = make_request(req, perms, NULL, NULL,
8542 rand() % mdsmap->get_num_in_mds());
8543 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8544 return r;
8545 }
8546
8547
8548 /**
8549 * Load inode into local cache.
8550 *
8551 * If the inode pointer is non-NULL, also take a reference on
8552 * the resulting Inode object in the same operation, so that the
8553 * caller can safely assume the inode will still be there after return.
8554 */
8555 int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8556 {
8557 ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
8558
8559 if (unmounting)
8560 return -ENOTCONN;
8561
8562 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8563 filepath path(ino);
8564 req->set_filepath(path);
8565
8566 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8567 if (r == 0 && inode != NULL) {
8568 vinodeno_t vino(ino, CEPH_NOSNAP);
8569 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8570 ceph_assert(p != inode_map.end());
8571 *inode = p->second;
8572 _ll_get(*inode);
8573 }
8574 ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
8575 return r;
8576 }
8577
8578 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8579 {
8580 std::lock_guard lock(client_lock);
8581 return _lookup_ino(ino, perms, inode);
8582 }
8583
8584 /**
8585 * Find the parent inode of `ino` and insert it into
8586 * our cache. Conditionally also set `parent` to a referenced
8587 * Inode* if the caller provides a non-NULL value.
8588 */
8589 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8590 {
8591 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
8592
8593 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8594 filepath path(ino->ino);
8595 req->set_filepath(path);
8596
8597 InodeRef target;
8598 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8599 // Give caller a reference to the parent ino if they provided a pointer.
8600 if (parent != NULL) {
8601 if (r == 0) {
8602 *parent = target.get();
8603 _ll_get(*parent);
8604 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
8605 } else {
8606 *parent = NULL;
8607 }
8608 }
8609 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
8610 return r;
8611 }
8612
8613 /**
8614 * Populate the parent dentry for `ino`, provided it is
8615 * a child of `parent`.
8616 */
8617 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8618 {
8619 ceph_assert(parent->is_dir());
8620 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
8621
8622 if (unmounting)
8623 return -ENOTCONN;
8624
8625 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8626 req->set_filepath2(filepath(parent->ino));
8627 req->set_filepath(filepath(ino->ino));
8628 req->set_inode(ino);
8629
8630 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8631 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
8632 return r;
8633 }
8634
8635 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8636 {
8637 std::lock_guard lock(client_lock);
8638 return _lookup_name(ino, parent, perms);
8639 }
8640
8641 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8642 {
8643 ceph_assert(in);
8644 Fh *f = new Fh(in, flags, cmode, perms);
8645
8646 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
8647
8648 if (in->snapid != CEPH_NOSNAP) {
8649 in->snap_cap_refs++;
8650 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8651 << ccap_string(in->caps_issued()) << dendl;
8652 }
8653
8654 const auto& conf = cct->_conf;
8655 f->readahead.set_trigger_requests(1);
8656 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8657 uint64_t max_readahead = Readahead::NO_LIMIT;
8658 if (conf->client_readahead_max_bytes) {
8659 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8660 }
8661 if (conf->client_readahead_max_periods) {
8662 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8663 }
8664 f->readahead.set_max_readahead_size(max_readahead);
8665 vector<uint64_t> alignments;
8666 alignments.push_back(in->layout.get_period());
8667 alignments.push_back(in->layout.stripe_unit);
8668 f->readahead.set_alignments(alignments);
8669
8670 return f;
8671 }
8672
8673 int Client::_release_fh(Fh *f)
8674 {
8675 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8676 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8677 Inode *in = f->inode.get();
8678 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
8679
8680 in->unset_deleg(f);
8681
8682 if (in->snapid == CEPH_NOSNAP) {
8683 if (in->put_open_ref(f->mode)) {
8684 _flush(in, new C_Client_FlushComplete(this, in));
8685 check_caps(in, 0);
8686 }
8687 } else {
8688 ceph_assert(in->snap_cap_refs > 0);
8689 in->snap_cap_refs--;
8690 }
8691
8692 _release_filelocks(f);
8693
8694 // Finally, read any async err (i.e. from flushes)
8695 int err = f->take_async_err();
8696 if (err != 0) {
8697 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
8698 << cpp_strerror(err) << dendl;
8699 } else {
8700 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
8701 }
8702
8703 _put_fh(f);
8704
8705 return err;
8706 }
8707
8708 void Client::_put_fh(Fh *f)
8709 {
8710 int left = f->put();
8711 if (!left) {
8712 delete f;
8713 }
8714 }
8715
8716 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
8717 const UserPerm& perms)
8718 {
8719 if (in->snapid != CEPH_NOSNAP &&
8720 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
8721 return -EROFS;
8722 }
8723
8724 // use normalized flags to generate cmode
8725 int cflags = ceph_flags_sys2wire(flags);
8726 if (cct->_conf.get_val<bool>("client_force_lazyio"))
8727 cflags |= CEPH_O_LAZY;
8728
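// e.g. O_RDWR normalizes to CEPH_FILE_MODE_RDWR, whose wanted caps
// include both the RD/CACHE and the WR/BUFFER file cap bits.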
8729 int cmode = ceph_flags_to_mode(cflags);
8730 int want = ceph_caps_for_mode(cmode);
8731 int result = 0;
8732
8733 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
8734
8735 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
8736 // update wanted?
8737 check_caps(in, CHECK_CAPS_NODELAY);
8738 } else {
8739
8740 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8741 filepath path;
8742 in->make_nosnap_relative_path(path);
8743 req->set_filepath(path);
8744 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
8745 req->head.args.open.mode = mode;
8746 req->head.args.open.pool = -1;
8747 if (cct->_conf->client_debug_getattr_caps)
8748 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8749 else
8750 req->head.args.open.mask = 0;
8751 req->head.args.open.old_size = in->size; // for O_TRUNC
8752 req->set_inode(in);
8753 result = make_request(req, perms);
8754
8755 /*
8756 * NFS expects that delegations will be broken on a conflicting open,
8757 * not just when there is actual conflicting access to the file. SMB leases
8758 * and oplocks also have similar semantics.
8759 *
8760 * Ensure that clients that have delegations enabled will wait on minimal
8761 * caps during open, just to ensure that other clients holding delegations
8762 * return theirs first.
8763 */
8764 if (deleg_timeout && result == 0) {
8765 int need = 0, have;
8766
8767 if (cmode & CEPH_FILE_MODE_WR)
8768 need |= CEPH_CAP_FILE_WR;
8769 if (cmode & CEPH_FILE_MODE_RD)
8770 need |= CEPH_CAP_FILE_RD;
8771
8772 result = get_caps(in, need, want, &have, -1);
8773 if (result < 0) {
8774 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
8775 " . Denying open: " <<
8776 cpp_strerror(result) << dendl;
8777 in->put_open_ref(cmode);
8778 } else {
8779 put_cap_ref(in, need);
8780 }
8781 }
8782 }
8783
8784 // success?
8785 if (result >= 0) {
8786 if (fhp)
8787 *fhp = _create_fh(in, flags, cmode, perms);
8788 } else {
8789 in->put_open_ref(cmode);
8790 }
8791
8792 trim_cache();
8793
8794 return result;
8795 }
8796
8797 int Client::_renew_caps(Inode *in)
8798 {
8799 int wanted = in->caps_file_wanted();
8800 if (in->is_any_caps() &&
8801 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
8802 check_caps(in, CHECK_CAPS_NODELAY);
8803 return 0;
8804 }
8805
8806 int flags = 0;
8807 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
8808 flags = O_RDWR;
8809 else if (wanted & CEPH_CAP_FILE_RD)
8810 flags = O_RDONLY;
8811 else if (wanted & CEPH_CAP_FILE_WR)
8812 flags = O_WRONLY;
8813
8814 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8815 filepath path;
8816 in->make_nosnap_relative_path(path);
8817 req->set_filepath(path);
8818 req->head.args.open.flags = flags;
8819 req->head.args.open.pool = -1;
8820 if (cct->_conf->client_debug_getattr_caps)
8821 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8822 else
8823 req->head.args.open.mask = 0;
8824 req->set_inode(in);
8825
8826 // duplicate in case Cap goes away; not sure if that race is a concern?
8827 const UserPerm *pperm = in->get_best_perms();
8828 UserPerm perms;
8829 if (pperm != NULL)
8830 perms = *pperm;
8831 int ret = make_request(req, perms);
8832 return ret;
8833 }
8834
8835 int Client::close(int fd)
8836 {
8837 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8838 std::lock_guard lock(client_lock);
8839 tout(cct) << "close" << std::endl;
8840 tout(cct) << fd << std::endl;
8841
8842 if (unmounting)
8843 return -ENOTCONN;
8844
8845 Fh *fh = get_filehandle(fd);
8846 if (!fh)
8847 return -EBADF;
8848 int err = _release_fh(fh);
8849 fd_map.erase(fd);
8850 put_fd(fd);
8851 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8852 return err;
8853 }
8854
8855
8856 // ------------
8857 // read, write
8858
8859 loff_t Client::lseek(int fd, loff_t offset, int whence)
8860 {
8861 std::lock_guard lock(client_lock);
8862 tout(cct) << "lseek" << std::endl;
8863 tout(cct) << fd << std::endl;
8864 tout(cct) << offset << std::endl;
8865 tout(cct) << whence << std::endl;
8866
8867 if (unmounting)
8868 return -ENOTCONN;
8869
8870 Fh *f = get_filehandle(fd);
8871 if (!f)
8872 return -EBADF;
8873 #if defined(__linux__) && defined(O_PATH)
8874 if (f->flags & O_PATH)
8875 return -EBADF;
8876 #endif
8877 return _lseek(f, offset, whence);
8878 }
8879
8880 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8881 {
8882 Inode *in = f->inode.get();
8883 int r;
8884 loff_t pos = -1;
8885
8886 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
8887 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8888 if (r < 0) {
8889 return r;
8890 }
8891 }
8892
8893 switch (whence) {
8894 case SEEK_SET:
8895 pos = offset;
8896 break;
8897
8898 case SEEK_CUR:
8899 pos = f->pos + offset;
8900 break;
8901
8902 case SEEK_END:
8903 pos = in->size + offset;
8904 break;
8905
8906 case SEEK_DATA:
8907 if (offset < 0 || offset >= in->size) {
8908 // don't swallow the error by returning the offset
8909 return -ENXIO;
8910 }
8911 pos = offset;
8912 break;
8913
8914 case SEEK_HOLE:
8915 if (offset < 0 || offset >= in->size) {
8916 return -ENXIO;
8917 } else {
8918 // no holes are tracked; the only "hole" begins at EOF
8919 pos = in->size;
8920 }
8921 break;
8922
8923 default:
8924 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
8925 return -EINVAL;
8926 }
8927
8928 if (pos < 0) {
8929 return -EINVAL;
8930 } else {
8931 f->pos = pos;
8932 }
8933
8934 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8935 return f->pos;
8936 }
8937
8938
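/*
 * Serialize updates to the file handle's implicit position (Fh::pos).
 * Waiters queue FIFO on pos_waiters, so pos-relative reads and writes
 * proceed in arrival order.
 */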
8939 void Client::lock_fh_pos(Fh *f)
8940 {
8941 ldout(cct, 10) << __func__ << " " << f << dendl;
8942
8943 if (f->pos_locked || !f->pos_waiters.empty()) {
8944 Cond cond;
8945 f->pos_waiters.push_back(&cond);
8946 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
8947 while (f->pos_locked || f->pos_waiters.front() != &cond)
8948 cond.Wait(client_lock);
8949 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
8950 ceph_assert(f->pos_waiters.front() == &cond);
8951 f->pos_waiters.pop_front();
8952 }
8953
8954 f->pos_locked = true;
8955 }
8956
8957 void Client::unlock_fh_pos(Fh *f)
8958 {
8959 ldout(cct, 10) << __func__ << " " << f << dendl;
8960 f->pos_locked = false;
8961 }
8962
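/*
 * Push an inode's inline data out to its first RADOS object
 * (<ino>.00000000) so the file can be read and written like any other.
 * The data write is guarded by a cmpxattr on the "inline_version"
 * xattr, so racing uninline attempts cannot clobber newer contents;
 * onfinish is completed once the object write is done.
 */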
8963 int Client::uninline_data(Inode *in, Context *onfinish)
8964 {
8965 if (!in->inline_data.length()) {
8966 onfinish->complete(0);
8967 return 0;
8968 }
8969
8970 char oid_buf[32];
8971 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
8972 object_t oid = oid_buf;
8973
8974 ObjectOperation create_ops;
8975 create_ops.create(false);
8976
8977 objecter->mutate(oid,
8978 OSDMap::file_to_object_locator(in->layout),
8979 create_ops,
8980 in->snaprealm->get_snap_context(),
8981 ceph::real_clock::now(),
8982 0,
8983 NULL);
8984
8985 bufferlist inline_version_bl;
8986 encode(in->inline_version, inline_version_bl);
8987
8988 ObjectOperation uninline_ops;
8989 uninline_ops.cmpxattr("inline_version",
8990 CEPH_OSD_CMPXATTR_OP_GT,
8991 CEPH_OSD_CMPXATTR_MODE_U64,
8992 inline_version_bl);
8993 bufferlist inline_data = in->inline_data;
8994 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
8995 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
8996
8997 objecter->mutate(oid,
8998 OSDMap::file_to_object_locator(in->layout),
8999 uninline_ops,
9000 in->snaprealm->get_snap_context(),
9001 ceph::real_clock::now(),
9002 0,
9003 onfinish);
9004
9005 return 0;
9006 }
9007
9008 //
9009
9010 // blocking osd interface
9011
9012 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9013 {
9014 std::lock_guard lock(client_lock);
9015 tout(cct) << "read" << std::endl;
9016 tout(cct) << fd << std::endl;
9017 tout(cct) << size << std::endl;
9018 tout(cct) << offset << std::endl;
9019
9020 if (unmounting)
9021 return -ENOTCONN;
9022
9023 Fh *f = get_filehandle(fd);
9024 if (!f)
9025 return -EBADF;
9026 #if defined(__linux__) && defined(O_PATH)
9027 if (f->flags & O_PATH)
9028 return -EBADF;
9029 #endif
9030 bufferlist bl;
9031 /* We can't return a byte count larger than INT_MAX, so clamp size to that */
9032 size = std::min(size, (loff_t)INT_MAX);
9033 int r = _read(f, offset, size, &bl);
9034 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9035 if (r >= 0) {
9036 bl.copy(0, bl.length(), buf);
9037 r = bl.length();
9038 }
9039 return r;
9040 }
9041
9042 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9043 {
9044 if (iovcnt < 0)
9045 return -EINVAL;
9046 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9047 }
9048
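/*
 * Common read path for read()/preadv(). Roughly: still-inline files are
 * served from (or first flushed out of) in->inline_data; otherwise the
 * read goes through the object cacher when client_oc is enabled and we
 * hold Fc/LAZYIO caps, and falls back to sync OSD reads (rechecking the
 * size and retrying on a short read near EOF) when we do not.
 */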
9049 int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
9050 {
9051 int want, have = 0;
9052 bool movepos = false;
9053 std::unique_ptr<C_SaferCond> onuninline;
9054 int64_t r = 0;
9055 const auto& conf = cct->_conf;
9056 Inode *in = f->inode.get();
9057 utime_t lat;
9058 utime_t start = ceph_clock_now();
9059
9060 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
9061 return -EBADF;
9062 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9063
9064 if (offset < 0) {
9065 lock_fh_pos(f);
9066 offset = f->pos;
9067 movepos = true;
9068 }
9069 loff_t start_pos = offset;
9070
9071 if (in->inline_version == 0) {
9072 r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9073 if (r < 0) {
9074 goto done;
9075 }
9076 ceph_assert(in->inline_version > 0);
9077 }
9078
9079 retry:
9080 if (f->mode & CEPH_FILE_MODE_LAZY)
9081 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
9082 else
9083 want = CEPH_CAP_FILE_CACHE;
9084 r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
9085 if (r < 0) {
9086 goto done;
9087 }
9088 if (f->flags & O_DIRECT)
9089 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
9090
9091 if (in->inline_version < CEPH_INLINE_NONE) {
9092 if (!(have & CEPH_CAP_FILE_CACHE)) {
9093 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9094 uninline_data(in, onuninline.get());
9095 } else {
9096 uint32_t len = in->inline_data.length();
9097 uint64_t endoff = offset + size;
9098 if (endoff > in->size)
9099 endoff = in->size;
9100
9101 if (offset < len) {
9102 if (endoff <= len) {
9103 bl->substr_of(in->inline_data, offset, endoff - offset);
9104 } else {
9105 bl->substr_of(in->inline_data, offset, len - offset);
9106 bl->append_zero(endoff - len);
9107 }
9108 r = endoff - offset;
9109 } else if ((uint64_t)offset < endoff) {
9110 bl->append_zero(endoff - offset);
9111 r = endoff - offset;
9112 } else {
9113 r = 0;
9114 }
9115 goto success;
9116 }
9117 }
9118
9119 if (!conf->client_debug_force_sync_read &&
9120 conf->client_oc &&
9121 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
9122
9123 if (f->flags & O_RSYNC) {
9124 _flush_range(in, offset, size);
9125 }
9126 r = _read_async(f, offset, size, bl);
9127 if (r < 0)
9128 goto done;
9129 } else {
9130 if (f->flags & O_DIRECT)
9131 _flush_range(in, offset, size);
9132
9133 bool checkeof = false;
9134 r = _read_sync(f, offset, size, bl, &checkeof);
9135 if (r < 0)
9136 goto done;
9137 if (checkeof) {
9138 offset += r;
9139 size -= r;
9140
9141 put_cap_ref(in, CEPH_CAP_FILE_RD);
9142 have = 0;
9143 // reverify size
9144 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9145 if (r < 0)
9146 goto done;
9147
9148 // eof? short read.
9149 if ((uint64_t)offset < in->size)
9150 goto retry;
9151 }
9152 }
9153
9154 success:
9155 ceph_assert(r >= 0);
9156 if (movepos) {
9157 // adjust fd pos
9158 f->pos = start_pos + r;
9159 }
9160
9161 lat = ceph_clock_now();
9162 lat -= start;
9163 logger->tinc(l_c_read, lat);
9164
9165 done:
9166 // done!
9167
9168 if (onuninline) {
9169 client_lock.Unlock();
9170 int ret = onuninline->wait();
9171 client_lock.Lock();
9172 if (ret >= 0 || ret == -ECANCELED) {
9173 in->inline_data.clear();
9174 in->inline_version = CEPH_INLINE_NONE;
9175 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9176 check_caps(in, 0);
9177 } else
9178 r = ret;
9179 }
9180 if (have) {
9181 put_cap_ref(in, CEPH_CAP_FILE_RD);
9182 }
9183 if (movepos) {
9184 unlock_fh_pos(f);
9185 }
9186 return r;
9187 }
9188
9189 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
9190 client(c), f(f) {
9191 f->get();
9192 f->readahead.inc_pending();
9193 }
9194
9195 Client::C_Readahead::~C_Readahead() {
9196 f->readahead.dec_pending();
9197 client->_put_fh(f);
9198 }
9199
9200 void Client::C_Readahead::finish(int r) {
9201 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
9202 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9203 }
9204
9205 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
9206 {
9207 const auto& conf = cct->_conf;
9208 Inode *in = f->inode.get();
9209
9210 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9211
9212 // trim read based on file size?
9213 if (off >= in->size)
9214 return 0;
9215 if (len == 0)
9216 return 0;
9217 if (off + len > in->size) {
9218 len = in->size - off;
9219 }
9220
9221 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
9222 << " max_bytes=" << f->readahead.get_max_readahead_size()
9223 << " max_periods=" << conf->client_readahead_max_periods << dendl;
9224
9225 // read (and possibly block)
9226 int r = 0;
9227 C_SaferCond onfinish("Client::_read_async flock");
9228 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9229 off, len, bl, 0, &onfinish);
9230 if (r == 0) {
9231 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
9232 client_lock.Unlock();
9233 r = onfinish.wait();
9234 client_lock.Lock();
9235 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
9236 }
9237
9238 if (f->readahead.get_min_readahead_size() > 0) {
9239 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
9240 if (readahead_extent.second > 0) {
9241 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
9242 << " (caller wants " << off << "~" << len << ")" << dendl;
9243 Context *onfinish2 = new C_Readahead(this, f);
9244 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9245 readahead_extent.first, readahead_extent.second,
9246 NULL, 0, onfinish2);
9247 if (r2 == 0) {
9248 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
9249 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9250 } else {
9251 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
9252 delete onfinish2;
9253 }
9254 }
9255 }
9256
9257 return r;
9258 }
9259
9260 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9261 bool *checkeof)
9262 {
9263 Inode *in = f->inode.get();
9264 uint64_t pos = off;
9265 int left = len;
9266 int read = 0;
9267
9268 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9269
9270 Mutex flock("Client::_read_sync flock");
9271 Cond cond;
9272 while (left > 0) {
9273 C_SaferCond onfinish("Client::_read_sync flock");
9274 bufferlist tbl;
9275
9276 int wanted = left;
9277 filer->read_trunc(in->ino, &in->layout, in->snapid,
9278 pos, left, &tbl, 0,
9279 in->truncate_size, in->truncate_seq,
9280 &onfinish);
9281 client_lock.Unlock();
9282 int r = onfinish.wait();
9283 client_lock.Lock();
9284
9285 // if we get ENOENT from OSD, assume 0 bytes returned
9286 if (r == -ENOENT)
9287 r = 0;
9288 if (r < 0)
9289 return r;
9290 if (tbl.length()) {
9291 r = tbl.length();
9292
9293 read += r;
9294 pos += r;
9295 left -= r;
9296 bl->claim_append(tbl);
9297 }
9298 // short read?
9299 if (r >= 0 && r < wanted) {
9300 if (pos < in->size) {
9301 // zero up to known EOF
9302 int64_t some = in->size - pos;
9303 if (some > left)
9304 some = left;
9305 auto z = buffer::ptr_node::create(some);
9306 z->zero();
9307 bl->push_back(std::move(z));
9308 read += some;
9309 pos += some;
9310 left -= some;
9311 if (left == 0)
9312 return read;
9313 }
9314
9315 *checkeof = true;
9316 return read;
9317 }
9318 }
9319 return read;
9320 }
9321
9322
9323 /*
9324 * we keep count of uncommitted sync writes on the inode, so that
9325 * fsync can DTRT (do the right thing).
9326 */
9327 void Client::_sync_write_commit(Inode *in)
9328 {
9329 ceph_assert(unsafe_sync_write > 0);
9330 unsafe_sync_write--;
9331
9332 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9333
9334 ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
9335 if (unsafe_sync_write == 0 && unmounting) {
9336 ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
9337 mount_cond.Signal();
9338 }
9339 }
9340
9341 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9342 {
9343 std::lock_guard lock(client_lock);
9344 tout(cct) << "write" << std::endl;
9345 tout(cct) << fd << std::endl;
9346 tout(cct) << size << std::endl;
9347 tout(cct) << offset << std::endl;
9348
9349 if (unmounting)
9350 return -ENOTCONN;
9351
9352 Fh *fh = get_filehandle(fd);
9353 if (!fh)
9354 return -EBADF;
9355 #if defined(__linux__) && defined(O_PATH)
9356 if (fh->flags & O_PATH)
9357 return -EBADF;
9358 #endif
9359 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9360 size = std::min(size, (loff_t)INT_MAX);
9361 int r = _write(fh, offset, size, buf, NULL, false);
9362 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9363 return r;
9364 }
9365
9366 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9367 {
9368 if (iovcnt < 0)
9369 return -EINVAL;
9370 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9371 }
9372
9373 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
9374 unsigned iovcnt, int64_t offset, bool write,
9375 bool clamp_to_int)
9376 {
9377 #if defined(__linux__) && defined(O_PATH)
9378 if (fh->flags & O_PATH)
9379 return -EBADF;
9380 #endif
9381 loff_t totallen = 0;
9382 for (unsigned i = 0; i < iovcnt; i++) {
9383 totallen += iov[i].iov_len;
9384 }
9385
9386 /*
9387 * Some of the API functions take 64-bit size values, but only return
9388 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9389 * we don't do I/Os larger than the values we can return.
9390 */
9391 if (clamp_to_int) {
9392 totallen = std::min(totallen, (loff_t)INT_MAX);
9393 }
9394 if (write) {
9395 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
9396 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
9397 return w;
9398 } else {
9399 bufferlist bl;
9400 int64_t r = _read(fh, offset, totallen, &bl);
9401 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
9402 if (r <= 0)
9403 return r;
9404
9405 int bufoff = 0;
9406 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
9407 /*
9408 * This handles the case where the bufferlist does not hold enough
9409 * data to completely fill this iov entry
9410 */
9411 if (resid < iov[j].iov_len) {
9412 bl.copy(bufoff, resid, (char *)iov[j].iov_base);
9413 break;
9414 } else {
9415 bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
9416 }
9417 resid -= iov[j].iov_len;
9418 bufoff += iov[j].iov_len;
9419 }
9420 return r;
9421 }
9422 }
9423
9424 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9425 {
9426 std::lock_guard lock(client_lock);
9427 tout(cct) << fd << std::endl;
9428 tout(cct) << offset << std::endl;
9429
9430 if (unmounting)
9431 return -ENOTCONN;
9432
9433 Fh *fh = get_filehandle(fd);
9434 if (!fh)
9435 return -EBADF;
9436 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9437 }
9438
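// _write() is the core write path.  In rough order it: checks the
// MDS-imposed maximum file size and pool fullness, resolves an implicit
// (O_APPEND) offset, enforces quota, acquires CEPH_CAP_FILE_WR plus
// buffer caps when available, clears any setuid/setgid bits, updates or
// uninlines inline data, and finally either writes through the object
// cacher (buffered path) or issues a synchronous Filer::write_trunc and
// waits for the OSDs to commit.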
9439 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
9440 const struct iovec *iov, int iovcnt)
9441 {
9442 uint64_t fpos = 0;
9443
9444 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
9445 return -EFBIG;
9446
9447 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9448 Inode *in = f->inode.get();
9449
9450 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
9451 return -ENOSPC;
9452 }
9453
9454 ceph_assert(in->snapid == CEPH_NOSNAP);
9455
9456 // was Fh opened as writeable?
9457 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9458 return -EBADF;
9459
9460 // use/adjust fd pos?
9461 if (offset < 0) {
9462 lock_fh_pos(f);
9463 /*
9464 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9465 * change out from under us.
9466 */
9467 if (f->flags & O_APPEND) {
9468 int r = _lseek(f, 0, SEEK_END);
9469 if (r < 0) {
9470 unlock_fh_pos(f);
9471 return r;
9472 }
9473 }
9474 offset = f->pos;
9475 fpos = offset+size;
9476 unlock_fh_pos(f);
9477 }
9478
9479 // check quota
9480 uint64_t endoff = offset + size;
9481 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
9482 f->actor_perms)) {
9483 return -EDQUOT;
9484 }
9485
9486 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9487
9488 ldout(cct, 10) << "cur file size is " << in->size << dendl;
9489
9490 // time it.
9491 utime_t start = ceph_clock_now();
9492
9493 if (in->inline_version == 0) {
9494 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9495 if (r < 0)
9496 return r;
9497 ceph_assert(in->inline_version > 0);
9498 }
9499
9500 // copy into a fresh buffer (since our write may be resubmitted, asynchronously)
9501 bufferlist bl;
9502 if (buf) {
9503 if (size > 0)
9504 bl.append(buf, size);
9505 } else if (iov){
9506 for (int i = 0; i < iovcnt; i++) {
9507 if (iov[i].iov_len > 0) {
9508 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
9509 }
9510 }
9511 }
9512
9513 utime_t lat;
9514 uint64_t totalwritten;
9515 int want, have;
9516 if (f->mode & CEPH_FILE_MODE_LAZY)
9517 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
9518 else
9519 want = CEPH_CAP_FILE_BUFFER;
9520 int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
9521 if (r < 0)
9522 return r;
9523
9524 /* clear the setuid/setgid bits, if any */
9525 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
9526 struct ceph_statx stx = { 0 };
9527
9528 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9529 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
9530 if (r < 0)
9531 return r;
9532 } else {
9533 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9534 }
9535
9536 if (f->flags & O_DIRECT)
9537 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
9538
9539 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
9540
9541 std::unique_ptr<C_SaferCond> onuninline = nullptr;
9542
9543 if (in->inline_version < CEPH_INLINE_NONE) {
9544 if (endoff > cct->_conf->client_max_inline_size ||
9545 endoff > CEPH_INLINE_MAX_SIZE ||
9546 !(have & CEPH_CAP_FILE_BUFFER)) {
9547 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9548 uninline_data(in, onuninline.get());
9549 } else {
9550 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9551
9552 uint32_t len = in->inline_data.length();
9553
9554 if (endoff < len)
9555 in->inline_data.copy(endoff, len - endoff, bl);
9556
9557 if (offset < len)
9558 in->inline_data.splice(offset, len - offset);
9559 else if (offset > len)
9560 in->inline_data.append_zero(offset - len);
9561
9562 in->inline_data.append(bl);
9563 in->inline_version++;
9564
9565 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9566
9567 goto success;
9568 }
9569 }
9570
9571 if (cct->_conf->client_oc &&
9572 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
9573 // do buffered write
9574 if (!in->oset.dirty_or_tx)
9575 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
9576
9577 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9578
9579 // async, caching, non-blocking.
9580 r = objectcacher->file_write(&in->oset, &in->layout,
9581 in->snaprealm->get_snap_context(),
9582 offset, size, bl, ceph::real_clock::now(),
9583 0);
9584 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9585
9586 if (r < 0)
9587 goto done;
9588
9589 // flush cached write if O_SYNC is set on file fh
9590 // O_DSYNC == O_SYNC on linux < 2.6.33
9591 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9592 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
9593 _flush_range(in, offset, size);
9594 }
9595 } else {
9596 if (f->flags & O_DIRECT)
9597 _flush_range(in, offset, size);
9598
9599 // simple, non-atomic sync write
9600 C_SaferCond onfinish("Client::_write flock");
9601 unsafe_sync_write++;
9602 get_cap_ref(in, CEPH_CAP_FILE_BUFFER); // released by onsafe callback
9603
9604 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
9605 offset, size, bl, ceph::real_clock::now(), 0,
9606 in->truncate_size, in->truncate_seq,
9607 &onfinish);
9608 client_lock.Unlock();
9609 onfinish.wait();
9610 client_lock.Lock();
9611 _sync_write_commit(in);
9612 }
9613
9614 // if we get here, write was successful, update client metadata
9615 success:
9616 // time
9617 lat = ceph_clock_now();
9618 lat -= start;
9619 logger->tinc(l_c_wrlat, lat);
9620
9621 if (fpos) {
9622 lock_fh_pos(f);
9623 f->pos = fpos;
9624 unlock_fh_pos(f);
9625 }
9626 totalwritten = size;
9627 r = (int64_t)totalwritten;
9628
9629 // extend file?
9630 if (totalwritten + offset > in->size) {
9631 in->size = totalwritten + offset;
9632 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9633
9634 if (is_quota_bytes_approaching(in, f->actor_perms)) {
9635 check_caps(in, CHECK_CAPS_NODELAY);
9636 } else if (is_max_size_approaching(in)) {
9637 check_caps(in, 0);
9638 }
9639
9640 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
9641 } else {
9642 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
9643 }
9644
9645 // mtime
9646 in->mtime = in->ctime = ceph_clock_now();
9647 in->change_attr++;
9648 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9649
9650 done:
9651
9652 if (nullptr != onuninline) {
9653 client_lock.Unlock();
9654 int uninline_ret = onuninline->wait();
9655 client_lock.Lock();
9656
9657 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
9658 in->inline_data.clear();
9659 in->inline_version = CEPH_INLINE_NONE;
9660 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9661 check_caps(in, 0);
9662 } else
9663 r = uninline_ret;
9664 }
9665
9666 put_cap_ref(in, CEPH_CAP_FILE_WR);
9667 return r;
9668 }
9669
9670 int Client::_flush(Fh *f)
9671 {
9672 Inode *in = f->inode.get();
9673 int err = f->take_async_err();
9674 if (err != 0) {
9675 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9676 << cpp_strerror(err) << dendl;
9677 } else {
9678 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9679 }
9680
9681 return err;
9682 }
9683
9684 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9685 {
9686 struct ceph_statx stx;
9687 stx.stx_size = length;
9688 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9689 }
9690
9691 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9692 {
9693 std::lock_guard lock(client_lock);
9694 tout(cct) << __func__ << std::endl;
9695 tout(cct) << fd << std::endl;
9696 tout(cct) << length << std::endl;
9697
9698 if (unmounting)
9699 return -ENOTCONN;
9700
9701 Fh *f = get_filehandle(fd);
9702 if (!f)
9703 return -EBADF;
9704 #if defined(__linux__) && defined(O_PATH)
9705 if (f->flags & O_PATH)
9706 return -EBADF;
9707 #endif
9708 struct stat attr;
9709 attr.st_size = length;
9710 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9711 }
9712
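// fsync() mirrors fsync(2)/fdatasync(2): syncdataonly selects the
// fdatasync-like behaviour (skip flushing dirty caps/metadata).  Note
// that even on success we surface any asynchronous write error recorded
// on the Fh.  Hypothetical caller sketch:
//
//   int r = client->fsync(fd, false /* data + metadata */);
//   if (r < 0)
//     ...;  // may be a deferred writeback error surfacing here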
9713 int Client::fsync(int fd, bool syncdataonly)
9714 {
9715 std::lock_guard lock(client_lock);
9716 tout(cct) << "fsync" << std::endl;
9717 tout(cct) << fd << std::endl;
9718 tout(cct) << syncdataonly << std::endl;
9719
9720 if (unmounting)
9721 return -ENOTCONN;
9722
9723 Fh *f = get_filehandle(fd);
9724 if (!f)
9725 return -EBADF;
9726 #if defined(__linux__) && defined(O_PATH)
9727 if (f->flags & O_PATH)
9728 return -EBADF;
9729 #endif
9730 int r = _fsync(f, syncdataonly);
9731 if (r == 0) {
9732 // The IOs in this fsync were okay, but maybe something happened
9733 // in the background that we should be reporting?
9734 r = f->take_async_err();
9735 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9736 << ") = 0, async_err = " << r << dendl;
9737 } else {
9738 // Assume that an error we encountered during fsync, even reported
9739 // synchronously, would also have applied the error to the Fh, and we
9740 // should clear it here to avoid returning the same error again on next
9741 // call.
9742 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9743 << r << dendl;
9744 f->take_async_err();
9745 }
9746 return r;
9747 }
9748
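// _fsync(Inode*) flushes in stages: (1) kick off a writeback of any
// cached dirty data, (2) unless syncdataonly, flush dirty caps
// (metadata) to the MDS and wait for any unsafe MDS requests on the
// inode, then (3) wait for the data writeback and the cap flush acks.
// Only after all of these does it report success.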
9749 int Client::_fsync(Inode *in, bool syncdataonly)
9750 {
9751 int r = 0;
9752 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
9753 ceph_tid_t flush_tid = 0;
9754 InodeRef tmp_ref;
9755 utime_t lat;
9756 utime_t start = ceph_clock_now();
9757
9758 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
9759
9760 if (cct->_conf->client_oc) {
9761 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
9762 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
9763 _flush(in, object_cacher_completion.get());
9764 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
9765 }
9766
9767 if (!syncdataonly && in->dirty_caps) {
9768 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
9769 if (in->flushing_caps)
9770 flush_tid = last_flush_tid;
9771 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
9772
9773 if (!syncdataonly && !in->unsafe_ops.empty()) {
9774 flush_mdlog_sync();
9775
9776 MetaRequest *req = in->unsafe_ops.back();
9777 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
9778
9779 req->get();
9780 wait_on_list(req->waitfor_safe);
9781 put_request(req);
9782 }
9783
9784 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
9785 client_lock.Unlock();
9786 ldout(cct, 15) << "waiting on data to flush" << dendl;
9787 r = object_cacher_completion->wait();
9788 client_lock.Lock();
9789 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
9790 } else {
9791 // FIXME: this can starve
9792 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
9793 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
9794 << " uncommitted, waiting" << dendl;
9795 wait_on_list(in->waitfor_commit);
9796 }
9797 }
9798
9799 if (!r) {
9800 if (flush_tid > 0)
9801 wait_sync_caps(in, flush_tid);
9802
9803 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
9804 } else {
9805 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
9806 << cpp_strerror(-r) << dendl;
9807 }
9808
9809 lat = ceph_clock_now();
9810 lat -= start;
9811 logger->tinc(l_c_fsync, lat);
9812
9813 return r;
9814 }
9815
9816 int Client::_fsync(Fh *f, bool syncdataonly)
9817 {
9818 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9819 return _fsync(f->inode.get(), syncdataonly);
9820 }
9821
9822 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9823 {
9824 std::lock_guard lock(client_lock);
9825 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9826 tout(cct) << fd << std::endl;
9827
9828 if (unmounting)
9829 return -ENOTCONN;
9830
9831 Fh *f = get_filehandle(fd);
9832 if (!f)
9833 return -EBADF;
9834 int r = _getattr(f->inode, mask, perms);
9835 if (r < 0)
9836 return r;
9837 fill_stat(f->inode, stbuf, NULL);
9838 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9839 return r;
9840 }
9841
9842 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9843 unsigned int want, unsigned int flags)
9844 {
9845 std::lock_guard lock(client_lock);
9846 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9847 tout(cct) << fd << std::endl;
9848
9849 if (unmounting)
9850 return -ENOTCONN;
9851
9852 Fh *f = get_filehandle(fd);
9853 if (!f)
9854 return -EBADF;
9855
9856 unsigned mask = statx_to_mask(flags, want);
9857
9858 int r = 0;
9859 if (mask && !f->inode->caps_issued_mask(mask, true)) {
9860 r = _getattr(f->inode, mask, perms);
9861 if (r < 0) {
9862 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9863 return r;
9864 }
9865 }
9866
9867 fill_statx(f->inode, mask, stx);
9868 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9869 return r;
9870 }
9871
9872 // not written yet, but I want to link!
9873
9874 int Client::chdir(const char *relpath, std::string &new_cwd,
9875 const UserPerm& perms)
9876 {
9877 std::lock_guard lock(client_lock);
9878 tout(cct) << "chdir" << std::endl;
9879 tout(cct) << relpath << std::endl;
9880
9881 if (unmounting)
9882 return -ENOTCONN;
9883
9884 filepath path(relpath);
9885 InodeRef in;
9886 int r = path_walk(path, &in, perms);
9887 if (r < 0)
9888 return r;
9889
9890 if (!(in.get()->is_dir()))
9891 return -ENOTDIR;
9892
9893 if (cwd != in)
9894 cwd.swap(in);
9895 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9896
9897 _getcwd(new_cwd, perms);
9898 return 0;
9899 }
9900
9901 void Client::_getcwd(string& dir, const UserPerm& perms)
9902 {
9903 filepath path;
9904 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
9905
9906 Inode *in = cwd.get();
9907 while (in != root) {
9908 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
9909
9910 // The cwd or an ancestor of it is unlinked
9911 if (in->dentries.empty()) {
9912 return;
9913 }
9914
9915 Dentry *dn = in->get_first_parent();
9916
9917
9918 if (!dn) {
9919 // look it up
9920 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
9921 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9922 filepath path(in->ino);
9923 req->set_filepath(path);
9924 req->set_inode(in);
9925 int res = make_request(req, perms);
9926 if (res < 0)
9927 break;
9928
9929 // start over
9930 path = filepath();
9931 in = cwd.get();
9932 continue;
9933 }
9934 path.push_front_dentry(dn->name);
9935 in = dn->dir->parent_inode;
9936 }
9937 dir = "/";
9938 dir += path.get_path();
9939 }
9940
9941 void Client::getcwd(string& dir, const UserPerm& perms)
9942 {
9943 std::lock_guard l(client_lock);
9944 if (!unmounting)
9945 _getcwd(dir, perms);
9946 }
9947
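// statfs() reports sizes in 4 MB blocks (CEPH_BLOCK_SHIFT == 22) while
// RADOS reports space in KiB, so the conversion below shifts by
// (22 - 10) = 12.  Worked example (illustrative numbers only): a cluster
// reporting stats.kb == 8388608 (8 GiB) yields
// f_blocks = 8388608 >> 12 = 2048 four-megabyte blocks, i.e. 8 GiB.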
9948 int Client::statfs(const char *path, struct statvfs *stbuf,
9949 const UserPerm& perms)
9950 {
9951 std::lock_guard l(client_lock);
9952 tout(cct) << __func__ << std::endl;
9953 unsigned long int total_files_on_fs;
9954
9955 if (unmounting)
9956 return -ENOTCONN;
9957
9958 ceph_statfs stats;
9959 C_SaferCond cond;
9960
9961 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9962 if (data_pools.size() == 1) {
9963 objecter->get_fs_stats(stats, data_pools[0], &cond);
9964 } else {
9965 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9966 }
9967
9968 client_lock.Unlock();
9969 int rval = cond.wait();
9970 ceph_assert(root);
9971 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
9972 client_lock.Lock();
9973
9974 if (rval < 0) {
9975 ldout(cct, 1) << "underlying call to statfs returned error: "
9976 << cpp_strerror(rval)
9977 << dendl;
9978 return rval;
9979 }
9980
9981 memset(stbuf, 0, sizeof(*stbuf));
9982
9983 /*
9984 * we're going to set a block size of 4MB so we can represent larger
9985 * FSes without overflowing. Additionally convert the space
9986 * measurements from KiB into counts of those 4MB blocks. We use
9987 * 4MB only because it is big enough, and because it actually *is*
9988 * the (ceph) default block size.
9989 */
9990 const int CEPH_BLOCK_SHIFT = 22;
9991 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9992 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
9993 stbuf->f_files = total_files_on_fs;
9994 stbuf->f_ffree = 0;
9995 stbuf->f_favail = -1;
9996 stbuf->f_fsid = -1; // ??
9997 stbuf->f_flag = 0; // ??
9998 stbuf->f_namemax = NAME_MAX;
9999
10000 // Usually quota_root will == root_ancestor, but if the mount root has no
10001 // quota and we can see a parent of it that does have a quota, we'll
10002 // respect that one instead.
10003 ceph_assert(root != nullptr);
10004 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10005
10006 // get_quota_root should always give us something
10007 // because client quotas are always enabled
10008 ceph_assert(quota_root != nullptr);
10009
10010 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10011
10012 // Skip the getattr if any sessions are stale, as we don't want to
10013 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10014 // is unhealthy.
10015 if (!_any_stale_sessions()) {
10016 int r = _getattr(quota_root, 0, perms, true);
10017 if (r != 0) {
10018 // Ignore return value: error getting latest inode metadata is not a good
10019 // reason to break "df".
10020 lderr(cct) << "Error in getattr on quota root 0x"
10021 << std::hex << quota_root->ino << std::dec
10022 << " statfs result may be outdated" << dendl;
10023 }
10024 }
10025
10026 // Special case: if there is a size quota set on the Inode acting
10027 // as the root for this client mount, then report the quota status
10028 // as the filesystem statistics.
10029 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10030 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10031 // It is possible for a quota to be exceeded: arithmetic here must
10032 // handle case where used > total.
10033 const fsblkcnt_t free = total > used ? total - used : 0;
10034
10035 stbuf->f_blocks = total;
10036 stbuf->f_bfree = free;
10037 stbuf->f_bavail = free;
10038 } else {
10039 // General case: report the cluster statistics returned from RADOS. Because
10040 // multiple pools may be used within one filesystem namespace via
10041 // layouts, this is the most correct thing we can do.
10042 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10043 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10044 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10045 }
10046
10047 return rval;
10048 }
10049
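// _do_filelock() translates a POSIX struct flock into an MDS filelock
// request.  lock_type selects the lock table (CEPH_LOCK_FCNTL for
// fcntl/POSIX locks, CEPH_LOCK_FLOCK for BSD flock), and fl->l_type maps
// onto the wire as:
//
//   F_RDLCK -> CEPH_LOCK_SHARED
//   F_WRLCK -> CEPH_LOCK_EXCL
//   F_UNLCK -> CEPH_LOCK_UNLOCK
//
// Blocking requests (sleep != 0) may be interrupted; see
// _interrupt_filelock() below.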
10050 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10051 struct flock *fl, uint64_t owner, bool removing)
10052 {
10053 ldout(cct, 10) << __func__ << " ino " << in->ino
10054 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10055 << " type " << fl->l_type << " owner " << owner
10056 << " " << fl->l_start << "~" << fl->l_len << dendl;
10057
10058 int lock_cmd;
10059 if (F_RDLCK == fl->l_type)
10060 lock_cmd = CEPH_LOCK_SHARED;
10061 else if (F_WRLCK == fl->l_type)
10062 lock_cmd = CEPH_LOCK_EXCL;
10063 else if (F_UNLCK == fl->l_type)
10064 lock_cmd = CEPH_LOCK_UNLOCK;
10065 else
10066 return -EIO;
10067
10068 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10069 sleep = 0;
10070
10071 /*
10072 * Set the most significant bit, so that the MDS knows the 'owner'
10073 * value alone is sufficient to identify the owner of the lock. (old code uses
10074 * both 'owner' and 'pid')
10075 */
10076 owner |= (1ULL << 63);
10077
10078 MetaRequest *req = new MetaRequest(op);
10079 filepath path;
10080 in->make_nosnap_relative_path(path);
10081 req->set_filepath(path);
10082 req->set_inode(in);
10083
10084 req->head.args.filelock_change.rule = lock_type;
10085 req->head.args.filelock_change.type = lock_cmd;
10086 req->head.args.filelock_change.owner = owner;
10087 req->head.args.filelock_change.pid = fl->l_pid;
10088 req->head.args.filelock_change.start = fl->l_start;
10089 req->head.args.filelock_change.length = fl->l_len;
10090 req->head.args.filelock_change.wait = sleep;
10091
10092 int ret;
10093 bufferlist bl;
10094
10095 if (sleep && switch_interrupt_cb) {
10096 // enable interrupt
10097 switch_interrupt_cb(callback_handle, req->get());
10098 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10099 // disable interrupt
10100 switch_interrupt_cb(callback_handle, NULL);
10101 if (ret == 0 && req->aborted()) {
10102 // effect of this lock request has been revoked by the 'lock intr' request
10103 ret = req->get_abort_code();
10104 }
10105 put_request(req);
10106 } else {
10107 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10108 }
10109
10110 if (ret == 0) {
10111 if (op == CEPH_MDS_OP_GETFILELOCK) {
10112 ceph_filelock filelock;
10113 auto p = bl.cbegin();
10114 decode(filelock, p);
10115
10116 if (CEPH_LOCK_SHARED == filelock.type)
10117 fl->l_type = F_RDLCK;
10118 else if (CEPH_LOCK_EXCL == filelock.type)
10119 fl->l_type = F_WRLCK;
10120 else
10121 fl->l_type = F_UNLCK;
10122
10123 fl->l_whence = SEEK_SET;
10124 fl->l_start = filelock.start;
10125 fl->l_len = filelock.length;
10126 fl->l_pid = filelock.pid;
10127 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10128 ceph_lock_state_t *lock_state;
10129 if (lock_type == CEPH_LOCK_FCNTL) {
10130 if (!in->fcntl_locks)
10131 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10132 lock_state = in->fcntl_locks.get();
10133 } else if (lock_type == CEPH_LOCK_FLOCK) {
10134 if (!in->flock_locks)
10135 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10136 lock_state = in->flock_locks.get();
10137 } else {
10138 ceph_abort();
10139 return -EINVAL;
10140 }
10141 _update_lock_state(fl, owner, lock_state);
10142
10143 if (!removing) {
10144 if (lock_type == CEPH_LOCK_FCNTL) {
10145 if (!fh->fcntl_locks)
10146 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10147 lock_state = fh->fcntl_locks.get();
10148 } else {
10149 if (!fh->flock_locks)
10150 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10151 lock_state = fh->flock_locks.get();
10152 }
10153 _update_lock_state(fl, owner, lock_state);
10154 }
10155 } else
10156 ceph_abort();
10157 }
10158 return ret;
10159 }
10160
10161 int Client::_interrupt_filelock(MetaRequest *req)
10162 {
10163 // Set abort code, but do not kick. The abort code prevents the request
10164 // from being re-sent.
10165 req->abort(-EINTR);
10166 if (req->mds < 0)
10167 return 0; // haven't sent the request
10168
10169 Inode *in = req->inode();
10170
10171 int lock_type;
10172 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10173 lock_type = CEPH_LOCK_FLOCK_INTR;
10174 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10175 lock_type = CEPH_LOCK_FCNTL_INTR;
10176 else {
10177 ceph_abort();
10178 return -EINVAL;
10179 }
10180
10181 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10182 filepath path;
10183 in->make_nosnap_relative_path(path);
10184 intr_req->set_filepath(path);
10185 intr_req->set_inode(in);
10186 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10187 intr_req->head.args.filelock_change.rule = lock_type;
10188 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10189
10190 UserPerm perms(req->get_uid(), req->get_gid());
10191 return make_request(intr_req, perms, NULL, NULL, -1);
10192 }
10193
10194 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10195 {
10196 if (!in->fcntl_locks && !in->flock_locks)
10197 return;
10198
10199 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10200 encode(nr_fcntl_locks, bl);
10201 if (nr_fcntl_locks) {
10202 auto &lock_state = in->fcntl_locks;
10203 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10204 p != lock_state->held_locks.end();
10205 ++p)
10206 encode(p->second, bl);
10207 }
10208
10209 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10210 encode(nr_flock_locks, bl);
10211 if (nr_flock_locks) {
10212 auto &lock_state = in->flock_locks;
10213 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10214 p != lock_state->held_locks.end();
10215 ++p)
10216 encode(p->second, bl);
10217 }
10218
10219 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10220 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10221 }
10222
10223 void Client::_release_filelocks(Fh *fh)
10224 {
10225 if (!fh->fcntl_locks && !fh->flock_locks)
10226 return;
10227
10228 Inode *in = fh->inode.get();
10229 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
10230
10231 list<pair<int, ceph_filelock> > to_release;
10232
10233 if (fh->fcntl_locks) {
10234 auto &lock_state = fh->fcntl_locks;
10235 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10236 p != lock_state->held_locks.end();
10237 ++p)
10238 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
10239 lock_state.reset();
10240 }
10241 if (fh->flock_locks) {
10242 auto &lock_state = fh->flock_locks;
10243 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10244 p != lock_state->held_locks.end();
10245 ++p)
10246 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
10247 lock_state.reset();
10248 }
10249
10250 if (to_release.empty())
10251 return;
10252
10253 // mds has already released filelocks if session was closed.
10254 if (in->caps.empty())
10255 return;
10256
10257 struct flock fl;
10258 memset(&fl, 0, sizeof(fl));
10259 fl.l_whence = SEEK_SET;
10260 fl.l_type = F_UNLCK;
10261
10262 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10263 p != to_release.end();
10264 ++p) {
10265 fl.l_start = p->second.start;
10266 fl.l_len = p->second.length;
10267 fl.l_pid = p->second.pid;
10268 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10269 p->second.owner, true);
10270 }
10271 }
10272
10273 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10274 ceph_lock_state_t *lock_state)
10275 {
10276 int lock_cmd;
10277 if (F_RDLCK == fl->l_type)
10278 lock_cmd = CEPH_LOCK_SHARED;
10279 else if (F_WRLCK == fl->l_type)
10280 lock_cmd = CEPH_LOCK_EXCL;
10281 else
10282 lock_cmd = CEPH_LOCK_UNLOCK;
10283
10284 ceph_filelock filelock;
10285 filelock.start = fl->l_start;
10286 filelock.length = fl->l_len;
10287 filelock.client = 0;
10288 // see comment in _do_filelock()
10289 filelock.owner = owner | (1ULL << 63);
10290 filelock.pid = fl->l_pid;
10291 filelock.type = lock_cmd;
10292
10293 if (filelock.type == CEPH_LOCK_UNLOCK) {
10294 list<ceph_filelock> activated_locks;
10295 lock_state->remove_lock(filelock, activated_locks);
10296 } else {
10297 bool r = lock_state->add_lock(filelock, false, false, NULL);
10298 ceph_assert(r);
10299 }
10300 }
10301
10302 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10303 {
10304 Inode *in = fh->inode.get();
10305 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10306 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10307 return ret;
10308 }
10309
10310 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10311 {
10312 Inode *in = fh->inode.get();
10313 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10314 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10315 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10316 return ret;
10317 }
10318
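// _flock() adapts flock(2)-style operations onto the same MDS filelock
// machinery, using a whole-file range (l_start = l_len = 0).  The cmd
// bits map as LOCK_SH -> F_RDLCK, LOCK_EX -> F_WRLCK, LOCK_UN -> F_UNLCK,
// with LOCK_NB clearing the 'sleep' flag.  A hypothetical non-blocking
// caller (assuming the ll_flock wrapper and -EAGAIN on conflict):
//
//   int r = client->ll_flock(fh, LOCK_EX | LOCK_NB, owner);
//   if (r == -EAGAIN)
//     ...;  // someone else holds a conflicting lock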
10319 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10320 {
10321 Inode *in = fh->inode.get();
10322 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10323
10324 int sleep = !(cmd & LOCK_NB);
10325 cmd &= ~LOCK_NB;
10326
10327 int type;
10328 switch (cmd) {
10329 case LOCK_SH:
10330 type = F_RDLCK;
10331 break;
10332 case LOCK_EX:
10333 type = F_WRLCK;
10334 break;
10335 case LOCK_UN:
10336 type = F_UNLCK;
10337 break;
10338 default:
10339 return -EINVAL;
10340 }
10341
10342 struct flock fl;
10343 memset(&fl, 0, sizeof(fl));
10344 fl.l_type = type;
10345 fl.l_whence = SEEK_SET;
10346
10347 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10348 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10349 return ret;
10350 }
10351
10352 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10353 {
10354 /* Since the only thing this does is wrap a call to statfs, and
10355 statfs takes a lock, it doesn't seem we have a need to split it
10356 out. */
10357 return statfs(0, stbuf, perms);
10358 }
10359
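// ll_register_callbacks() wires up the hooks a libcephfs consumer (e.g.
// ceph-fuse) provides: inode/dentry cache invalidation, lock-interrupt
// switching, remount, and umask.  Registering a callback also starts the
// finisher thread that will deliver it asynchronously.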
10360 void Client::ll_register_callbacks(struct client_callback_args *args)
10361 {
10362 if (!args)
10363 return;
10364 std::lock_guard l(client_lock);
10365 ldout(cct, 10) << __func__ << " cb " << args->handle
10366 << " invalidate_ino_cb " << args->ino_cb
10367 << " invalidate_dentry_cb " << args->dentry_cb
10368 << " switch_interrupt_cb " << args->switch_intr_cb
10369 << " remount_cb " << args->remount_cb
10370 << dendl;
10371 callback_handle = args->handle;
10372 if (args->ino_cb) {
10373 ino_invalidate_cb = args->ino_cb;
10374 async_ino_invalidator.start();
10375 }
10376 if (args->dentry_cb) {
10377 dentry_invalidate_cb = args->dentry_cb;
10378 async_dentry_invalidator.start();
10379 }
10380 if (args->switch_intr_cb) {
10381 switch_interrupt_cb = args->switch_intr_cb;
10382 interrupt_finisher.start();
10383 }
10384 if (args->remount_cb) {
10385 remount_cb = args->remount_cb;
10386 remount_finisher.start();
10387 }
10388 umask_cb = args->umask_cb;
10389 }
10390
10391 int Client::test_dentry_handling(bool can_invalidate)
10392 {
10393 int r = 0;
10394
10395 can_invalidate_dentries = can_invalidate;
10396
10397 if (can_invalidate_dentries) {
10398 ceph_assert(dentry_invalidate_cb);
10399 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10400 r = 0;
10401 } else {
10402 ceph_assert(remount_cb);
10403 ldout(cct, 1) << "using remount_cb" << dendl;
10404 r = _do_remount(false);
10405 }
10406
10407 return r;
10408 }
10409
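// _sync_fs() is the client-wide flush: it starts a writeback of all
// cached data, flushes dirty caps, waits for unsafe MDS requests and the
// cap flush acks, and finally (with client_lock dropped) waits for the
// data writeback to complete.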
10410 int Client::_sync_fs()
10411 {
10412 ldout(cct, 10) << __func__ << dendl;
10413
10414 // flush file data
10415 std::unique_ptr<C_SaferCond> cond = nullptr;
10416 if (cct->_conf->client_oc) {
10417 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10418 objectcacher->flush_all(cond.get());
10419 }
10420
10421 // flush caps
10422 flush_caps_sync();
10423 ceph_tid_t flush_tid = last_flush_tid;
10424
10425 // wait for unsafe mds requests
10426 wait_unsafe_requests();
10427
10428 wait_sync_caps(flush_tid);
10429
10430 if (nullptr != cond) {
10431 client_lock.Unlock();
10432 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10433 cond->wait();
10434 ldout(cct, 15) << __func__ << " flush finished" << dendl;
10435 client_lock.Lock();
10436 }
10437
10438 return 0;
10439 }
10440
10441 int Client::sync_fs()
10442 {
10443 std::lock_guard l(client_lock);
10444
10445 if (unmounting)
10446 return -ENOTCONN;
10447
10448 return _sync_fs();
10449 }
10450
10451 int64_t Client::drop_caches()
10452 {
10453 std::lock_guard l(client_lock);
10454 return objectcacher->release_all();
10455 }
10456
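// LAZY_IO relaxes the usual CephFS coherency rules: with
// CEPH_FILE_MODE_LAZY set on the Fh, reads and writes may be buffered
// even when multiple clients have the file open, and the application
// becomes responsible for calling lazyio_propagate()/lazyio_synchronize()
// at its own synchronization points.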
10457 int Client::_lazyio(Fh *fh, int enable)
10458 {
10459 Inode *in = fh->inode.get();
10460 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10461
10462 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10463 return 0;
10464
10465 int orig_mode = fh->mode;
10466 if (enable) {
10467 fh->mode |= CEPH_FILE_MODE_LAZY;
10468 in->get_open_ref(fh->mode);
10469 in->put_open_ref(orig_mode);
10470 check_caps(in, CHECK_CAPS_NODELAY);
10471 } else {
10472 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10473 in->get_open_ref(fh->mode);
10474 in->put_open_ref(orig_mode);
10475 check_caps(in, 0);
10476 }
10477
10478 return 0;
10479 }
10480
10481 int Client::lazyio(int fd, int enable)
10482 {
10483 std::lock_guard l(client_lock);
10484 Fh *f = get_filehandle(fd);
10485 if (!f)
10486 return -EBADF;
10487
10488 return _lazyio(f, enable);
10489 }
10490
10491 int Client::ll_lazyio(Fh *fh, int enable)
10492 {
10493 std::lock_guard lock(client_lock);
10494 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10495 tout(cct) << __func__ << std::endl;
10496
10497 return _lazyio(fh, enable);
10498 }
10499
10500 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
10501 {
10502 std::lock_guard l(client_lock);
10503 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
10504 << ", " << offset << ", " << count << ")" << dendl;
10505
10506 Fh *f = get_filehandle(fd);
10507 if (!f)
10508 return -EBADF;
10509
10510 // for now
10511 _fsync(f, true);
10512
10513 return 0;
10514 }
10515
10516 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10517 {
10518 std::lock_guard l(client_lock);
10519 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10520 << ", " << offset << ", " << count << ")" << dendl;
10521
10522 Fh *f = get_filehandle(fd);
10523 if (!f)
10524 return -EBADF;
10525 Inode *in = f->inode.get();
10526
10527 _fsync(f, true);
10528 if (_release(in)) {
10529 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10530 if (r < 0)
10531 return r;
10532 }
10533 return 0;
10534 }
10535
10536
10537 // =============================
10538 // snaps
10539
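// Snapshots are exposed through a virtual snapshot directory: creating a
// snapshot is a mkdir inside the snapdir, and removing one is an rmdir.
// A hypothetical equivalent from a mounted shell:
//
//   mkdir /mnt/cephfs/some/dir/.snap/mysnap    # == mksnap()
//   rmdir /mnt/cephfs/some/dir/.snap/mysnap    # == rmsnap()
//
// (".snap" is the default snapdir name; it is configurable.)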
10540 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10541 {
10542 std::lock_guard l(client_lock);
10543
10544 if (unmounting)
10545 return -ENOTCONN;
10546
10547 filepath path(relpath);
10548 InodeRef in;
10549 int r = path_walk(path, &in, perm);
10550 if (r < 0)
10551 return r;
10552 if (cct->_conf->client_permissions) {
10553 r = may_create(in.get(), perm);
10554 if (r < 0)
10555 return r;
10556 }
10557 Inode *snapdir = open_snapdir(in.get());
10558 return _mkdir(snapdir, name, 0, perm);
10559 }
10560
10561 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10562 {
10563 std::lock_guard l(client_lock);
10564
10565 if (unmounting)
10566 return -ENOTCONN;
10567
10568 filepath path(relpath);
10569 InodeRef in;
10570 int r = path_walk(path, &in, perms);
10571 if (r < 0)
10572 return r;
10573 if (cct->_conf->client_permissions) {
10574 r = may_delete(in.get(), NULL, perms);
10575 if (r < 0)
10576 return r;
10577 }
10578 Inode *snapdir = open_snapdir(in.get());
10579 return _rmdir(snapdir, name, perms);
10580 }
10581
10582 // =============================
10583 // expose caps
10584
10585 int Client::get_caps_issued(int fd) {
10586
10587 std::lock_guard lock(client_lock);
10588
10589 if (unmounting)
10590 return -ENOTCONN;
10591
10592 Fh *f = get_filehandle(fd);
10593 if (!f)
10594 return -EBADF;
10595
10596 return f->inode->caps_issued();
10597 }
10598
10599 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10600 {
10601 std::lock_guard lock(client_lock);
10602
10603 if (unmounting)
10604 return -ENOTCONN;
10605
10606 filepath p(path);
10607 InodeRef in;
10608 int r = path_walk(p, &in, perms, true);
10609 if (r < 0)
10610 return r;
10611 return in->caps_issued();
10612 }
10613
10614 // =========================================
10615 // low level
10616
10617 Inode *Client::open_snapdir(Inode *diri)
10618 {
10619 Inode *in;
10620 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10621 if (!inode_map.count(vino)) {
10622 in = new Inode(this, vino, &diri->layout);
10623
10624 in->ino = diri->ino;
10625 in->snapid = CEPH_SNAPDIR;
10626 in->mode = diri->mode;
10627 in->uid = diri->uid;
10628 in->gid = diri->gid;
10629 in->nlink = 1;
10630 in->mtime = diri->mtime;
10631 in->ctime = diri->ctime;
10632 in->btime = diri->btime;
10633 in->size = diri->size;
10634 in->change_attr = diri->change_attr;
10635
10636 in->dirfragtree.clear();
10637 in->snapdir_parent = diri;
10638 diri->flags |= I_SNAPDIR_OPEN;
10639 inode_map[vino] = in;
10640 if (use_faked_inos())
10641 _assign_faked_ino(in);
10642 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10643 } else {
10644 in = inode_map[vino];
10645 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10646 }
10647 return in;
10648 }
10649
10650 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10651 Inode **out, const UserPerm& perms)
10652 {
10653 std::lock_guard lock(client_lock);
10654 vinodeno_t vparent = _get_vino(parent);
10655 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10656 tout(cct) << __func__ << std::endl;
10657 tout(cct) << name << std::endl;
10658
10659 if (unmounting)
10660 return -ENOTCONN;
10661
10662 int r = 0;
10663 if (!fuse_default_permissions) {
10664 if (strcmp(name, ".") && strcmp(name, "..")) {
10665 r = may_lookup(parent, perms);
10666 if (r < 0)
10667 return r;
10668 }
10669 }
10670
10671 string dname(name);
10672 InodeRef in;
10673
10674 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10675 if (r < 0) {
10676 attr->st_ino = 0;
10677 goto out;
10678 }
10679
10680 ceph_assert(in);
10681 fill_stat(in, attr);
10682 _ll_get(in.get());
10683
10684 out:
10685 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10686 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10687 tout(cct) << attr->st_ino << std::endl;
10688 *out = in.get();
10689 return r;
10690 }
10691
10692 int Client::ll_lookup_inode(
10693 struct inodeno_t ino,
10694 const UserPerm& perms,
10695 Inode **inode)
10696 {
10697 ceph_assert(inode != NULL);
10698 std::lock_guard lock(client_lock);
10699 ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
10700
10701 if (unmounting)
10702 return -ENOTCONN;
10703
10704 // Step 1: get the inode and return it via *inode
10705 int r = _lookup_ino(ino, perms, inode);
10706 if (r)
10707 return r;
10708
10709 ceph_assert(*inode != NULL);
10710
10711 if (!(*inode)->dentries.empty()) {
10712 ldout(cct, 8) << __func__ << " dentry already present" << dendl;
10713 return 0;
10714 }
10715
10716 if ((*inode)->is_root()) {
10717 ldout(cct, 8) << "ino is root, no parent" << dendl;
10718 return 0;
10719 }
10720
10721 // Step 2: request the parent inode, so that we can look up the name
10722 Inode *parent;
10723 r = _lookup_parent(*inode, perms, &parent);
10724 if (r) {
10725 _ll_forget(*inode, 1);
10726 return r;
10727 }
10728
10729 ceph_assert(parent != NULL);
10730
10731 // Step 3: finally, get the name (dentry) of the requested inode
10732 r = _lookup_name(*inode, parent, perms);
10733 if (r) {
10734 // Unexpected error
10735 _ll_forget(parent, 1);
10736 _ll_forget(*inode, 1);
10737 return r;
10738 }
10739
10740 _ll_forget(parent, 1);
10741 return 0;
10742 }
10743
10744 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10745 struct ceph_statx *stx, unsigned want, unsigned flags,
10746 const UserPerm& perms)
10747 {
10748 std::lock_guard lock(client_lock);
10749 vinodeno_t vparent = _get_vino(parent);
10750 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10751 tout(cct) << "ll_lookupx" << std::endl;
10752 tout(cct) << name << std::endl;
10753
10754 if (unmounting)
10755 return -ENOTCONN;
10756
10757 int r = 0;
10758 if (!fuse_default_permissions) {
10759 r = may_lookup(parent, perms);
10760 if (r < 0)
10761 return r;
10762 }
10763
10764 string dname(name);
10765 InodeRef in;
10766
10767 unsigned mask = statx_to_mask(flags, want);
10768 r = _lookup(parent, dname, mask, &in, perms);
10769 if (r < 0) {
10770 stx->stx_ino = 0;
10771 stx->stx_mask = 0;
10772 } else {
10773 ceph_assert(in);
10774 fill_statx(in, mask, stx);
10775 _ll_get(in.get());
10776 }
10777
10778 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10779 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10780 tout(cct) << stx->stx_ino << std::endl;
10781 *out = in.get();
10782 return r;
10783 }
10784
10785 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10786 unsigned int want, unsigned int flags, const UserPerm& perms)
10787 {
10788 std::lock_guard lock(client_lock);
10789
10790 if (unmounting)
10791 return -ENOTCONN;
10792
10793 filepath fp(name, 0);
10794 InodeRef in;
10795 int rc;
10796 unsigned mask = statx_to_mask(flags, want);
10797
10798 ldout(cct, 3) << __func__ << " " << name << dendl;
10799 tout(cct) << __func__ << std::endl;
10800 tout(cct) << name << std::endl;
10801
10802 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10803 if (rc < 0) {
10804 /* zero out mask, just in case... */
10805 stx->stx_mask = 0;
10806 stx->stx_ino = 0;
10807 *out = NULL;
10808 return rc;
10809 } else {
10810 ceph_assert(in);
10811 fill_statx(in, mask, stx);
10812 _ll_get(in.get());
10813 *out = in.get();
10814 return 0;
10815 }
10816 }
10817
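// _ll_get()/_ll_put() maintain the low-level (libcephfs/FUSE) reference
// count on an Inode, pinning the inode (and, for a directory, its parent
// dentry) while the kernel or application still remembers it.  The count
// is dropped via ll_forget(), typically in response to a FUSE FORGET.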
10818 void Client::_ll_get(Inode *in)
10819 {
10820 if (in->ll_ref == 0) {
10821 in->get();
10822 if (in->is_dir() && !in->dentries.empty()) {
10823 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
10824 in->get_first_parent()->get(); // pin dentry
10825 }
10826 if (in->snapid != CEPH_NOSNAP)
10827 ll_snap_ref[in->snapid]++;
10828 }
10829 in->ll_get();
10830 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
10831 }
10832
10833 int Client::_ll_put(Inode *in, uint64_t num)
10834 {
10835 in->ll_put(num);
10836 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
10837 if (in->ll_ref == 0) {
10838 if (in->is_dir() && !in->dentries.empty()) {
10839 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
10840 in->get_first_parent()->put(); // unpin dentry
10841 }
10842 if (in->snapid != CEPH_NOSNAP) {
10843 auto p = ll_snap_ref.find(in->snapid);
10844 ceph_assert(p != ll_snap_ref.end());
10845 ceph_assert(p->second > 0);
10846 if (--p->second == 0)
10847 ll_snap_ref.erase(p);
10848 }
10849 put_inode(in);
10850 return 0;
10851 } else {
10852 return in->ll_ref;
10853 }
10854 }
10855
10856 void Client::_ll_drop_pins()
10857 {
10858 ldout(cct, 10) << __func__ << dendl;
10859 std::set<InodeRef> to_be_put; // entries are released one by one when this set is destroyed on scope exit
10860 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10861 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10862 it != inode_map.end();
10863 it = next) {
10864 Inode *in = it->second;
10865 next = it;
10866 ++next;
10867 if (in->ll_ref){
10868 to_be_put.insert(in);
10869 _ll_put(in, in->ll_ref);
10870 }
10871 }
10872 }
10873
10874 bool Client::_ll_forget(Inode *in, uint64_t count)
10875 {
10876 inodeno_t ino = in->ino;
10877
10878 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
10879 tout(cct) << __func__ << std::endl;
10880 tout(cct) << ino.val << std::endl;
10881 tout(cct) << count << std::endl;
10882
10883 // Ignore forget if we're no longer mounted
10884 if (unmounting)
10885 return true;
10886
10887 if (ino == 1) return true; // ignore forget on root.
10888
10889 bool last = false;
10890 if (in->ll_ref < count) {
10891 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
10892 << ", which only has ll_ref=" << in->ll_ref << dendl;
10893 _ll_put(in, in->ll_ref);
10894 last = true;
10895 } else {
10896 if (_ll_put(in, count) == 0)
10897 last = true;
10898 }
10899
10900 return last;
10901 }
10902
10903 bool Client::ll_forget(Inode *in, uint64_t count)
10904 {
10905 std::lock_guard lock(client_lock);
10906 return _ll_forget(in, count);
10907 }
10908
10909 bool Client::ll_put(Inode *in)
10910 {
10911 /* ll_forget already takes the lock */
10912 return ll_forget(in, 1);
10913 }
10914
10915 int Client::ll_get_snap_ref(snapid_t snap)
10916 {
10917 std::lock_guard lock(client_lock);
10918 auto p = ll_snap_ref.find(snap);
10919 if (p != ll_snap_ref.end())
10920 return p->second;
10921 return 0;
10922 }
10923
10924 snapid_t Client::ll_get_snapid(Inode *in)
10925 {
10926 std::lock_guard lock(client_lock);
10927 return in->snapid;
10928 }
10929
10930 Inode *Client::ll_get_inode(ino_t ino)
10931 {
10932 std::lock_guard lock(client_lock);
10933
10934 if (unmounting)
10935 return NULL;
10936
10937 vinodeno_t vino = _map_faked_ino(ino);
10938 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10939 if (p == inode_map.end())
10940 return NULL;
10941 Inode *in = p->second;
10942 _ll_get(in);
10943 return in;
10944 }
10945
10946 Inode *Client::ll_get_inode(vinodeno_t vino)
10947 {
10948 std::lock_guard lock(client_lock);
10949
10950 if (unmounting)
10951 return NULL;
10952
10953 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10954 if (p == inode_map.end())
10955 return NULL;
10956 Inode *in = p->second;
10957 _ll_get(in);
10958 return in;
10959 }
10960
10961 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10962 {
10963 vinodeno_t vino = _get_vino(in);
10964
10965 ldout(cct, 8) << __func__ << " " << vino << dendl;
10966 tout(cct) << __func__ << std::endl;
10967 tout(cct) << vino.ino.val << std::endl;
10968
10969 if (vino.snapid < CEPH_NOSNAP)
10970 return 0;
10971 else
10972 return _getattr(in, caps, perms);
10973 }
10974
10975 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10976 {
10977 std::lock_guard lock(client_lock);
10978
10979 if (unmounting)
10980 return -ENOTCONN;
10981
10982 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10983
10984 if (res == 0)
10985 fill_stat(in, attr);
10986 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
10987 return res;
10988 }
10989
10990 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10991 unsigned int flags, const UserPerm& perms)
10992 {
10993 std::lock_guard lock(client_lock);
10994
10995 if (unmounting)
10996 return -ENOTCONN;
10997
10998 int res = 0;
10999 unsigned mask = statx_to_mask(flags, want);
11000
11001 if (mask && !in->caps_issued_mask(mask, true))
11002 res = _ll_getattr(in, mask, perms);
11003
11004 if (res == 0)
11005 fill_statx(in, mask, stx);
11006 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11007 return res;
11008 }
11009
11010 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11011 const UserPerm& perms, InodeRef *inp)
11012 {
11013 vinodeno_t vino = _get_vino(in);
11014
11015 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
11016 << dendl;
11017 tout(cct) << __func__ << std::endl;
11018 tout(cct) << vino.ino.val << std::endl;
11019 tout(cct) << stx->stx_mode << std::endl;
11020 tout(cct) << stx->stx_uid << std::endl;
11021 tout(cct) << stx->stx_gid << std::endl;
11022 tout(cct) << stx->stx_size << std::endl;
11023 tout(cct) << stx->stx_mtime << std::endl;
11024 tout(cct) << stx->stx_atime << std::endl;
11025 tout(cct) << stx->stx_btime << std::endl;
11026 tout(cct) << mask << std::endl;
11027
11028 if (!fuse_default_permissions) {
11029 int res = may_setattr(in, stx, mask, perms);
11030 if (res < 0)
11031 return res;
11032 }
11033
11034 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11035
11036 return __setattrx(in, stx, mask, perms, inp);
11037 }
11038
11039 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11040 const UserPerm& perms)
11041 {
11042 std::lock_guard lock(client_lock);
11043
11044 if (unmounting)
11045 return -ENOTCONN;
11046
11047 InodeRef target(in);
11048 int res = _ll_setattrx(in, stx, mask, perms, &target);
11049 if (res == 0) {
11050 ceph_assert(in == target.get());
11051 fill_statx(in, in->caps_issued(), stx);
11052 }
11053
11054 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11055 return res;
11056 }
11057
11058 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11059 const UserPerm& perms)
11060 {
11061 struct ceph_statx stx;
11062 stat_to_statx(attr, &stx);
11063
11064 std::lock_guard lock(client_lock);
11065
11066 if (unmounting)
11067 return -ENOTCONN;
11068
11069 InodeRef target(in);
11070 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11071 if (res == 0) {
11072 ceph_assert(in == target.get());
11073 fill_stat(in, attr);
11074 }
11075
11076 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11077 return res;
11078 }
11079
11080
11081 // ----------
11082 // xattrs
11083
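// The xattr entry points come in three flavours, mirroring the libc API:
// getxattr()/setxattr() follow symlinks, the l-prefixed variants do not,
// and the f-prefixed variants operate on an open fd.  All of them funnel
// into the _getxattr()/_setxattr() helpers below.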
11084 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11085 const UserPerm& perms)
11086 {
11087 std::lock_guard lock(client_lock);
11088
11089 if (unmounting)
11090 return -ENOTCONN;
11091
11092 InodeRef in;
11093 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11094 if (r < 0)
11095 return r;
11096 return _getxattr(in, name, value, size, perms);
11097 }
11098
11099 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11100 const UserPerm& perms)
11101 {
11102 std::lock_guard lock(client_lock);
11103
11104 if (unmounting)
11105 return -ENOTCONN;
11106
11107 InodeRef in;
11108 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11109 if (r < 0)
11110 return r;
11111 return _getxattr(in, name, value, size, perms);
11112 }
11113
11114 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11115 const UserPerm& perms)
11116 {
11117 std::lock_guard lock(client_lock);
11118
11119 if (unmounting)
11120 return -ENOTCONN;
11121
11122 Fh *f = get_filehandle(fd);
11123 if (!f)
11124 return -EBADF;
11125 return _getxattr(f->inode, name, value, size, perms);
11126 }
11127
11128 int Client::listxattr(const char *path, char *list, size_t size,
11129 const UserPerm& perms)
11130 {
11131 std::lock_guard lock(client_lock);
11132
11133 if (unmounting)
11134 return -ENOTCONN;
11135
11136 InodeRef in;
11137 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11138 if (r < 0)
11139 return r;
11140 return Client::_listxattr(in.get(), list, size, perms);
11141 }
11142
11143 int Client::llistxattr(const char *path, char *list, size_t size,
11144 const UserPerm& perms)
11145 {
11146 std::lock_guard lock(client_lock);
11147
11148 if (unmounting)
11149 return -ENOTCONN;
11150
11151 InodeRef in;
11152 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11153 if (r < 0)
11154 return r;
11155 return Client::_listxattr(in.get(), list, size, perms);
11156 }
11157
11158 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11159 {
11160 std::lock_guard lock(client_lock);
11161
11162 if (unmounting)
11163 return -ENOTCONN;
11164
11165 Fh *f = get_filehandle(fd);
11166 if (!f)
11167 return -EBADF;
11168 return Client::_listxattr(f->inode.get(), list, size, perms);
11169 }
11170
11171 int Client::removexattr(const char *path, const char *name,
11172 const UserPerm& perms)
11173 {
11174 std::lock_guard lock(client_lock);
11175
11176 if (unmounting)
11177 return -ENOTCONN;
11178
11179 InodeRef in;
11180 int r = Client::path_walk(path, &in, perms, true);
11181 if (r < 0)
11182 return r;
11183 return _removexattr(in, name, perms);
11184 }
11185
11186 int Client::lremovexattr(const char *path, const char *name,
11187 const UserPerm& perms)
11188 {
11189 std::lock_guard lock(client_lock);
11190
11191 if (unmounting)
11192 return -ENOTCONN;
11193
11194 InodeRef in;
11195 int r = Client::path_walk(path, &in, perms, false);
11196 if (r < 0)
11197 return r;
11198 return _removexattr(in, name, perms);
11199 }
11200
11201 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11202 {
11203 std::lock_guard lock(client_lock);
11204
11205 if (unmounting)
11206 return -ENOTCONN;
11207
11208 Fh *f = get_filehandle(fd);
11209 if (!f)
11210 return -EBADF;
11211 return _removexattr(f->inode, name, perms);
11212 }
11213
11214 int Client::setxattr(const char *path, const char *name, const void *value,
11215 size_t size, int flags, const UserPerm& perms)
11216 {
11217 _setxattr_maybe_wait_for_osdmap(name, value, size);
11218
11219 std::lock_guard lock(client_lock);
11220
11221 if (unmounting)
11222 return -ENOTCONN;
11223
11224 InodeRef in;
11225 int r = Client::path_walk(path, &in, perms, true);
11226 if (r < 0)
11227 return r;
11228 return _setxattr(in, name, value, size, flags, perms);
11229 }
11230
11231 int Client::lsetxattr(const char *path, const char *name, const void *value,
11232 size_t size, int flags, const UserPerm& perms)
11233 {
11234 _setxattr_maybe_wait_for_osdmap(name, value, size);
11235
11236 std::lock_guard lock(client_lock);
11237
11238 if (unmounting)
11239 return -ENOTCONN;
11240
11241 InodeRef in;
11242 int r = Client::path_walk(path, &in, perms, false);
11243 if (r < 0)
11244 return r;
11245 return _setxattr(in, name, value, size, flags, perms);
11246 }
11247
11248 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11249 int flags, const UserPerm& perms)
11250 {
11251 _setxattr_maybe_wait_for_osdmap(name, value, size);
11252
11253 std::lock_guard lock(client_lock);
11254
11255 if (unmounting)
11256 return -ENOTCONN;
11257
11258 Fh *f = get_filehandle(fd);
11259 if (!f)
11260 return -EBADF;
11261 return _setxattr(f->inode, name, value, size, flags, perms);
11262 }
11263
11264 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11265 const UserPerm& perms)
11266 {
11267 int r;
11268
11269 const VXattr *vxattr = _match_vxattr(in, name);
11270 if (vxattr) {
11271 r = -ENODATA;
11272
11273 // Do a force getattr to get the latest quota before returning
11274 // a value to userspace.
11275 int flags = 0;
11276 if (vxattr->flags & VXATTR_RSTAT) {
11277 flags |= CEPH_STAT_RSTAT;
11278 }
11279 r = _getattr(in, flags, perms, true);
11280 if (r != 0) {
11281 // Error from getattr!
11282 return r;
11283 }
11284
11285 // call pointer-to-member function
11286 char buf[256];
11287 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11288 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11289 } else {
11290 r = -ENODATA;
11291 }
11292
11293 if (size != 0) {
11294 if (r > (int)size) {
11295 r = -ERANGE;
11296 } else if (r > 0) {
11297 memcpy(value, buf, r);
11298 }
11299 }
11300 goto out;
11301 }
11302
11303 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11304 r = -EOPNOTSUPP;
11305 goto out;
11306 }
11307
11308 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11309 if (r == 0) {
11310 string n(name);
11311 r = -ENODATA;
11312 if (in->xattrs.count(n)) {
11313 r = in->xattrs[n].length();
11314 if (r > 0 && size != 0) {
11315 if (size >= (unsigned)r)
11316 memcpy(value, in->xattrs[n].c_str(), r);
11317 else
11318 r = -ERANGE;
11319 }
11320 }
11321 }
11322 out:
11323 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
11324 return r;
11325 }
11326
11327 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11328 const UserPerm& perms)
11329 {
11330 if (cct->_conf->client_permissions) {
11331 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11332 if (r < 0)
11333 return r;
11334 }
11335 return _getxattr(in.get(), name, value, size, perms);
11336 }
11337
11338 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11339 size_t size, const UserPerm& perms)
11340 {
11341 std::lock_guard lock(client_lock);
11342
11343 if (unmounting)
11344 return -ENOTCONN;
11345
11346 vinodeno_t vino = _get_vino(in);
11347
11348 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11349 tout(cct) << __func__ << std::endl;
11350 tout(cct) << vino.ino.val << std::endl;
11351 tout(cct) << name << std::endl;
11352
11353 if (!fuse_default_permissions) {
11354 int r = xattr_permission(in, name, MAY_READ, perms);
11355 if (r < 0)
11356 return r;
11357 }
11358
11359 return _getxattr(in, name, value, size, perms);
11360 }
11361
11362 int Client::_listxattr(Inode *in, char *name, size_t size,
11363 const UserPerm& perms)
11364 {
11365 bool len_only = (size == 0);
11366 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11367 if (r != 0) {
11368 goto out;
11369 }
11370
11371 r = 0;
11372 for (const auto& p : in->xattrs) {
11373 size_t this_len = p.first.length() + 1;
11374 r += this_len;
11375 if (len_only)
11376 continue;
11377
11378 if (this_len > size) {
11379 r = -ERANGE;
11380 goto out;
11381 }
11382
11383 memcpy(name, p.first.c_str(), this_len);
11384 name += this_len;
11385 size -= this_len;
11386 }
11387
11388 const VXattr *vxattr;
11389 for (vxattr = _get_vxattrs(in); vxattr && !vxattr->name.empty(); vxattr++) {
11390 if (vxattr->hidden)
11391 continue;
11392 // call pointer-to-member function
11393 if (vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))
11394 continue;
11395
11396 size_t this_len = vxattr->name.length() + 1;
11397 r += this_len;
11398 if (len_only)
11399 continue;
11400
11401 if (this_len > size) {
11402 r = -ERANGE;
11403 goto out;
11404 }
11405
11406 memcpy(name, vxattr->name.c_str(), this_len);
11407 name += this_len;
11408 size -= this_len;
11409 }
11410 out:
11411 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
11412 return r;
11413 }
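/*
 * Annotation (not upstream code): _listxattr() packs the names as
 * consecutive NUL-terminated strings, exactly like listxattr(2), and the
 * same size == 0 length probe applies. A hedged usage sketch:
 *
 *   int need = client->ll_listxattr(in, nullptr, 0, perms);  // length-only probe
 *   if (need > 0) {
 *     std::vector<char> names(need);
 *     need = client->ll_listxattr(in, names.data(), names.size(), perms);
 *     for (char *p = names.data(); p < names.data() + need; p += strlen(p) + 1)
 *       handle_name(p);  // handle_name() is hypothetical
 *   }
 */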
11414
11415 int Client::ll_listxattr(Inode *in, char *names, size_t size,
11416 const UserPerm& perms)
11417 {
11418 std::lock_guard lock(client_lock);
11419
11420 if (unmounting)
11421 return -ENOTCONN;
11422
11423 vinodeno_t vino = _get_vino(in);
11424
11425 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11426 tout(cct) << __func__ << std::endl;
11427 tout(cct) << vino.ino.val << std::endl;
11428 tout(cct) << size << std::endl;
11429
11430 return _listxattr(in, names, size, perms);
11431 }
11432
11433 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11434 size_t size, int flags, const UserPerm& perms)
11435 {
11436
11437 int xattr_flags = 0;
11438 if (!value)
11439 xattr_flags |= CEPH_XATTR_REMOVE;
11440 if (flags & XATTR_CREATE)
11441 xattr_flags |= CEPH_XATTR_CREATE;
11442 if (flags & XATTR_REPLACE)
11443 xattr_flags |= CEPH_XATTR_REPLACE;
11444
11445 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11446 filepath path;
11447 in->make_nosnap_relative_path(path);
11448 req->set_filepath(path);
11449 req->set_string2(name);
11450 req->set_inode(in);
11451 req->head.args.setxattr.flags = xattr_flags;
11452
11453 bufferlist bl;
11454 ceph_assert(value || size == 0);
11455 bl.append((const char*)value, size);
11456 req->set_data(bl);
11457
11458 int res = make_request(req, perms);
11459
11460 trim_cache();
11461 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
11462 res << dendl;
11463 return res;
11464 }
11465
11466 int Client::_setxattr(Inode *in, const char *name, const void *value,
11467 size_t size, int flags, const UserPerm& perms)
11468 {
11469 if (in->snapid != CEPH_NOSNAP) {
11470 return -EROFS;
11471 }
11472
11473 bool posix_acl_xattr = false;
11474 if (acl_type == POSIX_ACL)
11475 posix_acl_xattr = !strncmp(name, "system.", 7);
11476
11477 if (strncmp(name, "user.", 5) &&
11478 strncmp(name, "security.", 9) &&
11479 strncmp(name, "trusted.", 8) &&
11480 strncmp(name, "ceph.", 5) &&
11481 !posix_acl_xattr)
11482 return -EOPNOTSUPP;
11483
11484 bool check_realm = false;
11485
11486 if (posix_acl_xattr) {
11487 if (!strcmp(name, ACL_EA_ACCESS)) {
11488 mode_t new_mode = in->mode;
11489 if (value) {
11490 int ret = posix_acl_equiv_mode(value, size, &new_mode);
11491 if (ret < 0)
11492 return ret;
11493 if (ret == 0) {
11494 value = NULL;
11495 size = 0;
11496 }
11497 if (new_mode != in->mode) {
11498 struct ceph_statx stx;
11499 stx.stx_mode = new_mode;
11500 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
11501 if (ret < 0)
11502 return ret;
11503 }
11504 }
11505 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
11506 if (value) {
11507 if (!S_ISDIR(in->mode))
11508 return -EACCES;
11509 int ret = posix_acl_check(value, size);
11510 if (ret < 0)
11511 return -EINVAL;
11512 if (ret == 0) {
11513 value = NULL;
11514 size = 0;
11515 }
11516 }
11517 } else {
11518 return -EOPNOTSUPP;
11519 }
11520 } else {
11521 const VXattr *vxattr = _match_vxattr(in, name);
11522 if (vxattr) {
11523 if (vxattr->readonly)
11524 return -EOPNOTSUPP;
11525 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
11526 check_realm = true;
11527 }
11528 }
11529
11530 int ret = _do_setxattr(in, name, value, size, flags, perms);
11531 if (ret >= 0 && check_realm) {
11532 // check if snaprealm was created for quota inode
11533 if (in->quota.is_enable() &&
11534 !(in->snaprealm && in->snaprealm->ino == in->ino))
11535 ret = -EOPNOTSUPP;
11536 }
11537
11538 return ret;
11539 }
11540
11541 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11542 size_t size, int flags, const UserPerm& perms)
11543 {
11544 if (cct->_conf->client_permissions) {
11545 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11546 if (r < 0)
11547 return r;
11548 }
11549 return _setxattr(in.get(), name, value, size, flags, perms);
11550 }
11551
11552 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11553 {
11554 string tmp;
11555 if (name == "layout") {
11556 string::iterator begin = value.begin();
11557 string::iterator end = value.end();
11558 keys_and_values<string::iterator> p; // create instance of parser
11559 std::map<string, string> m; // map to receive results
11560 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11561 return -EINVAL;
11562 }
11563 if (begin != end)
11564 return -EINVAL;
11565 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11566 if (q->first == "pool") {
11567 tmp = q->second;
11568 break;
11569 }
11570 }
11571 } else if (name == "layout.pool") {
11572 tmp = value;
11573 }
11574
11575 if (tmp.length()) {
11576 int64_t pool;
11577 try {
11578 pool = boost::lexical_cast<unsigned>(tmp);
11579 if (!osdmap->have_pg_pool(pool))
11580 return -ENOENT;
11581 } catch (boost::bad_lexical_cast const&) {
11582 pool = osdmap->lookup_pg_pool_name(tmp);
11583 if (pool < 0) {
11584 return -ENOENT;
11585 }
11586 }
11587 }
11588
11589 return 0;
11590 }
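/*
 * Annotation (not upstream code): the "layout" value checked above is a
 * space-separated key=value list, while "layout.pool" takes a bare pool
 * name or id. For example, from a shell (the pool name "cephfs_data" is
 * hypothetical):
 *
 *   setfattr -n ceph.file.layout \
 *     -v "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data" file
 *   setfattr -n ceph.file.layout.pool -v cephfs_data file
 *
 * An unknown pool returns -ENOENT, which lets the caller fetch a newer
 * osdmap and retry (see _setxattr_maybe_wait_for_osdmap below).
 */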
11591
11592 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11593 {
11594 // Setting a layout pool via MetaRequest requires a current osdmap epoch.
11595 // There is a race where a newly created data pool is not yet known to the client or the MDS.
11596 // Make sure the client has the latest osdmap so the MDS can quickly judge whether it needs a newer one.
11597 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11598 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11599 string rest(strstr(name, "layout"));
11600 string v((const char*)value, size);
11601 int r = objecter->with_osdmap([&](const OSDMap& o) {
11602 return _setxattr_check_data_pool(rest, v, &o);
11603 });
11604
11605 if (r == -ENOENT) {
11606 C_SaferCond ctx;
11607 objecter->wait_for_latest_osdmap(&ctx);
11608 ctx.wait();
11609 }
11610 }
11611 }
11612
11613 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11614 size_t size, int flags, const UserPerm& perms)
11615 {
11616 _setxattr_maybe_wait_for_osdmap(name, value, size);
11617
11618 std::lock_guard lock(client_lock);
11619
11620 if (unmounting)
11621 return -ENOTCONN;
11622
11623 vinodeno_t vino = _get_vino(in);
11624
11625 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11626 tout(cct) << __func__ << std::endl;
11627 tout(cct) << vino.ino.val << std::endl;
11628 tout(cct) << name << std::endl;
11629
11630 if (!fuse_default_permissions) {
11631 int r = xattr_permission(in, name, MAY_WRITE, perms);
11632 if (r < 0)
11633 return r;
11634 }
11635 return _setxattr(in, name, value, size, flags, perms);
11636 }
11637
11638 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11639 {
11640 if (in->snapid != CEPH_NOSNAP) {
11641 return -EROFS;
11642 }
11643
11644 // same xattrs supported by kernel client
11645 if (strncmp(name, "user.", 5) &&
11646 strncmp(name, "system.", 7) &&
11647 strncmp(name, "security.", 9) &&
11648 strncmp(name, "trusted.", 8) &&
11649 strncmp(name, "ceph.", 5))
11650 return -EOPNOTSUPP;
11651
11652 const VXattr *vxattr = _match_vxattr(in, name);
11653 if (vxattr && vxattr->readonly)
11654 return -EOPNOTSUPP;
11655
11656 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11657 filepath path;
11658 in->make_nosnap_relative_path(path);
11659 req->set_filepath(path);
11660 req->set_filepath2(name);
11661 req->set_inode(in);
11662
11663 int res = make_request(req, perms);
11664
11665 trim_cache();
11666 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11667 return res;
11668 }
11669
11670 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11671 {
11672 if (cct->_conf->client_permissions) {
11673 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11674 if (r < 0)
11675 return r;
11676 }
11677 return _removexattr(in.get(), name, perms);
11678 }
11679
11680 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11681 {
11682 std::lock_guard lock(client_lock);
11683
11684 if (unmounting)
11685 return -ENOTCONN;
11686
11687 vinodeno_t vino = _get_vino(in);
11688
11689 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11690 tout(cct) << "ll_removexattr" << std::endl;
11691 tout(cct) << vino.ino.val << std::endl;
11692 tout(cct) << name << std::endl;
11693
11694 if (!fuse_default_permissions) {
11695 int r = xattr_permission(in, name, MAY_WRITE, perms);
11696 if (r < 0)
11697 return r;
11698 }
11699
11700 return _removexattr(in, name, perms);
11701 }
11702
11703 bool Client::_vxattrcb_quota_exists(Inode *in)
11704 {
11705 return in->quota.is_enable() &&
11706 in->snaprealm && in->snaprealm->ino == in->ino;
11707 }
11708 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11709 {
11710 return snprintf(val, size,
11711 "max_bytes=%lld max_files=%lld",
11712 (long long int)in->quota.max_bytes,
11713 (long long int)in->quota.max_files);
11714 }
11715 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11716 {
11717 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11718 }
11719 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11720 {
11721 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11722 }
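/*
 * Annotation (not upstream code): with a quota set on a directory these
 * callbacks surface it to userspace, e.g. (values hypothetical):
 *
 *   getfattr -n ceph.quota dir            -> "max_bytes=10000000 max_files=1000"
 *   getfattr -n ceph.quota.max_bytes dir  -> "10000000"
 *
 * _vxattrcb_quota_exists() additionally requires that the inode is the
 * root of its own snaprealm, i.e. the quota was set on this inode itself.
 */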
11723
11724 bool Client::_vxattrcb_layout_exists(Inode *in)
11725 {
11726 return in->layout != file_layout_t();
11727 }
11728 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11729 {
11730 int r = snprintf(val, size,
11731 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11732 (unsigned long long)in->layout.stripe_unit,
11733 (unsigned long long)in->layout.stripe_count,
11734 (unsigned long long)in->layout.object_size);
11735 objecter->with_osdmap([&](const OSDMap& o) {
11736 if (o.have_pg_pool(in->layout.pool_id))
11737 r += snprintf(val + r, size - r, "%s",
11738 o.get_pool_name(in->layout.pool_id).c_str());
11739 else
11740 r += snprintf(val + r, size - r, "%" PRIu64,
11741 (uint64_t)in->layout.pool_id);
11742 });
11743 if (in->layout.pool_ns.length())
11744 r += snprintf(val + r, size - r, " pool_namespace=%s",
11745 in->layout.pool_ns.c_str());
11746 return r;
11747 }
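/*
 * Annotation (not upstream code): a sample string rendered by
 * _vxattrcb_layout() (pool name hypothetical):
 *
 *   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"
 *
 * If the pool id is no longer present in the osdmap the numeric id is
 * printed instead of a name, and pool_namespace is appended only when
 * non-empty.
 */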
11748 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11749 {
11750 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
11751 }
11752 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11753 {
11754 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
11755 }
11756 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11757 {
11758 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
11759 }
11760 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11761 {
11762 size_t r;
11763 objecter->with_osdmap([&](const OSDMap& o) {
11764 if (o.have_pg_pool(in->layout.pool_id))
11765 r = snprintf(val, size, "%s", o.get_pool_name(
11766 in->layout.pool_id).c_str());
11767 else
11768 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11769 });
11770 return r;
11771 }
11772 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11773 {
11774 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11775 }
11776 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11777 {
11778 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11779 }
11780 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11781 {
11782 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
11783 }
11784 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11785 {
11786 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
11787 }
11788 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11789 {
11790 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11791 }
11792 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11793 {
11794 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
11795 }
11796 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11797 {
11798 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
11799 }
11800 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11801 {
11802 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
11803 }
11804 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11805 {
11806 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
11807 (long)in->rstat.rctime.nsec());
11808 }
11809 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11810 {
11811 return in->dir_pin != -ENODATA;
11812 }
11813 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11814 {
11815 return snprintf(val, size, "%ld", (long)in->dir_pin);
11816 }
11817
11818 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
11819 {
11820 return !in->snap_btime.is_zero();
11821 }
11822
11823 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
11824 {
11825 return snprintf(val, size, "%llu.%09lu",
11826 (long long unsigned)in->snap_btime.sec(),
11827 (long unsigned)in->snap_btime.nsec());
11828 }
11829
11830 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11831 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11832
11833 #define XATTR_NAME_CEPH(_type, _name) \
11834 { \
11835 name: CEPH_XATTR_NAME(_type, _name), \
11836 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11837 readonly: true, \
11838 hidden: false, \
11839 exists_cb: NULL, \
11840 flags: 0, \
11841 }
11842 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
11843 { \
11844 name: CEPH_XATTR_NAME(_type, _name), \
11845 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11846 readonly: true, \
11847 hidden: false, \
11848 exists_cb: NULL, \
11849 flags: _flags, \
11850 }
11851 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11852 { \
11853 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11854 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11855 readonly: false, \
11856 hidden: true, \
11857 exists_cb: &Client::_vxattrcb_layout_exists, \
11858 flags: 0, \
11859 }
11860 #define XATTR_QUOTA_FIELD(_type, _name) \
11861 { \
11862 name: CEPH_XATTR_NAME(_type, _name), \
11863 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11864 readonly: false, \
11865 hidden: true, \
11866 exists_cb: &Client::_vxattrcb_quota_exists, \
11867 flags: 0, \
11868 }
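/*
 * Annotation (not upstream code): a sketch of what one table entry below
 * expands to. Using the GCC-style designated initializers above,
 *
 *   XATTR_NAME_CEPH(dir, entries)
 *
 * becomes roughly:
 *
 *   { name: "ceph.dir.entries",
 *     getxattr_cb: &Client::_vxattrcb_dir_entries,
 *     readonly: true, hidden: false, exists_cb: NULL, flags: 0, }
 */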
11869
11870 const Client::VXattr Client::_dir_vxattrs[] = {
11871 {
11872 name: "ceph.dir.layout",
11873 getxattr_cb: &Client::_vxattrcb_layout,
11874 readonly: false,
11875 hidden: true,
11876 exists_cb: &Client::_vxattrcb_layout_exists,
11877 flags: 0,
11878 },
11879 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11880 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11881 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11882 XATTR_LAYOUT_FIELD(dir, layout, pool),
11883 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11884 XATTR_NAME_CEPH(dir, entries),
11885 XATTR_NAME_CEPH(dir, files),
11886 XATTR_NAME_CEPH(dir, subdirs),
11887 XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
11888 XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
11889 XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
11890 XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
11891 XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
11892 {
11893 name: "ceph.quota",
11894 getxattr_cb: &Client::_vxattrcb_quota,
11895 readonly: false,
11896 hidden: true,
11897 exists_cb: &Client::_vxattrcb_quota_exists,
11898 flags: 0,
11899 },
11900 XATTR_QUOTA_FIELD(quota, max_bytes),
11901 XATTR_QUOTA_FIELD(quota, max_files),
11902 {
11903 name: "ceph.dir.pin",
11904 getxattr_cb: &Client::_vxattrcb_dir_pin,
11905 readonly: false,
11906 hidden: true,
11907 exists_cb: &Client::_vxattrcb_dir_pin_exists,
11908 flags: 0,
11909 },
11910 {
11911 name: "ceph.snap.btime",
11912 getxattr_cb: &Client::_vxattrcb_snap_btime,
11913 readonly: true,
11914 hidden: false,
11915 exists_cb: &Client::_vxattrcb_snap_btime_exists,
11916 flags: 0,
11917 },
11918 { name: "" } /* Required table terminator */
11919 };
11920
11921 const Client::VXattr Client::_file_vxattrs[] = {
11922 {
11923 name: "ceph.file.layout",
11924 getxattr_cb: &Client::_vxattrcb_layout,
11925 readonly: false,
11926 hidden: true,
11927 exists_cb: &Client::_vxattrcb_layout_exists,
11928 flags: 0,
11929 },
11930 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
11931 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
11932 XATTR_LAYOUT_FIELD(file, layout, object_size),
11933 XATTR_LAYOUT_FIELD(file, layout, pool),
11934 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
11935 {
11936 name: "ceph.snap.btime",
11937 getxattr_cb: &Client::_vxattrcb_snap_btime,
11938 readonly: true,
11939 hidden: false,
11940 exists_cb: &Client::_vxattrcb_snap_btime_exists,
11941 flags: 0,
11942 },
11943 { name: "" } /* Required table terminator */
11944 };
11945
11946 const Client::VXattr *Client::_get_vxattrs(Inode *in)
11947 {
11948 if (in->is_dir())
11949 return _dir_vxattrs;
11950 else if (in->is_file())
11951 return _file_vxattrs;
11952 return NULL;
11953 }
11954
11955 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11956 {
11957 if (strncmp(name, "ceph.", 5) == 0) {
11958 const VXattr *vxattr = _get_vxattrs(in);
11959 if (vxattr) {
11960 while (!vxattr->name.empty()) {
11961 if (vxattr->name == name)
11962 return vxattr;
11963 vxattr++;
11964 }
11965 }
11966 }
11967 return NULL;
11968 }
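/*
 * Annotation (not upstream code): e.g. _match_vxattr(in, "ceph.dir.rbytes")
 * on a directory inode walks _dir_vxattrs until the name matches; a name
 * that does not start with "ceph." (or is not in the table) returns NULL
 * and is treated as an ordinary xattr by the callers above.
 */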
11969
11970 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11971 {
11972 std::lock_guard lock(client_lock);
11973
11974 if (unmounting)
11975 return -ENOTCONN;
11976
11977 vinodeno_t vino = _get_vino(in);
11978
11979 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11980 tout(cct) << "ll_readlink" << std::endl;
11981 tout(cct) << vino.ino.val << std::endl;
11982
11983 for (auto dn : in->dentries) {
11984 touch_dn(dn);
11985 }
11986
11987 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11988 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11989 return r;
11990 }
11991
11992 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
11993 const UserPerm& perms, InodeRef *inp)
11994 {
11995 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
11996 << mode << dec << ", " << rdev << ", uid " << perms.uid()
11997 << ", gid " << perms.gid() << ")" << dendl;
11998
11999 if (strlen(name) > NAME_MAX)
12000 return -ENAMETOOLONG;
12001
12002 if (dir->snapid != CEPH_NOSNAP) {
12003 return -EROFS;
12004 }
12005 if (is_quota_files_exceeded(dir, perms)) {
12006 return -EDQUOT;
12007 }
12008
12009 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
12010
12011 filepath path;
12012 dir->make_nosnap_relative_path(path);
12013 path.push_dentry(name);
12014 req->set_filepath(path);
12015 req->set_inode(dir);
12016 req->head.args.mknod.rdev = rdev;
12017 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12018 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12019
12020 bufferlist xattrs_bl;
12021 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12022 if (res < 0)
12023 goto fail;
12024 req->head.args.mknod.mode = mode;
12025 if (xattrs_bl.length() > 0)
12026 req->set_data(xattrs_bl);
12027
12028 Dentry *de;
12029 res = get_or_create(dir, name, &de);
12030 if (res < 0)
12031 goto fail;
12032 req->set_dentry(de);
12033
12034 res = make_request(req, perms, inp);
12035
12036 trim_cache();
12037
12038 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12039 return res;
12040
12041 fail:
12042 put_request(req);
12043 return res;
12044 }
12045
12046 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12047 dev_t rdev, struct stat *attr, Inode **out,
12048 const UserPerm& perms)
12049 {
12050 std::lock_guard lock(client_lock);
12051
12052 if (unmounting)
12053 return -ENOTCONN;
12054
12055 vinodeno_t vparent = _get_vino(parent);
12056
12057 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12058 tout(cct) << "ll_mknod" << std::endl;
12059 tout(cct) << vparent.ino.val << std::endl;
12060 tout(cct) << name << std::endl;
12061 tout(cct) << mode << std::endl;
12062 tout(cct) << rdev << std::endl;
12063
12064 if (!fuse_default_permissions) {
12065 int r = may_create(parent, perms);
12066 if (r < 0)
12067 return r;
12068 }
12069
12070 InodeRef in;
12071 int r = _mknod(parent, name, mode, rdev, perms, &in);
12072 if (r == 0) {
12073 fill_stat(in, attr);
12074 _ll_get(in.get());
12075 }
12076 tout(cct) << attr->st_ino << std::endl;
12077 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12078 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12079 *out = in.get();
12080 return r;
12081 }
12082
12083 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12084 dev_t rdev, Inode **out,
12085 struct ceph_statx *stx, unsigned want, unsigned flags,
12086 const UserPerm& perms)
12087 {
12088 unsigned caps = statx_to_mask(flags, want);
12089 std::lock_guard lock(client_lock);
12090
12091 if (unmounting)
12092 return -ENOTCONN;
12093
12094 vinodeno_t vparent = _get_vino(parent);
12095
12096 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12097 tout(cct) << "ll_mknodx" << std::endl;
12098 tout(cct) << vparent.ino.val << std::endl;
12099 tout(cct) << name << std::endl;
12100 tout(cct) << mode << std::endl;
12101 tout(cct) << rdev << std::endl;
12102
12103 if (!fuse_default_permissions) {
12104 int r = may_create(parent, perms);
12105 if (r < 0)
12106 return r;
12107 }
12108
12109 InodeRef in;
12110 int r = _mknod(parent, name, mode, rdev, perms, &in);
12111 if (r == 0) {
12112 fill_statx(in, caps, stx);
12113 _ll_get(in.get());
12114 }
12115 tout(cct) << stx->stx_ino << std::endl;
12116 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12117 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12118 *out = in.get();
12119 return r;
12120 }
12121
12122 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
12123 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
12124 int object_size, const char *data_pool, bool *created,
12125 const UserPerm& perms)
12126 {
12127 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
12128 mode << dec << ")" << dendl;
12129
12130 if (strlen(name) > NAME_MAX)
12131 return -ENAMETOOLONG;
12132 if (dir->snapid != CEPH_NOSNAP) {
12133 return -EROFS;
12134 }
12135 if (is_quota_files_exceeded(dir, perms)) {
12136 return -EDQUOT;
12137 }
12138
12139 // use normalized flags to generate cmode
12140 int cflags = ceph_flags_sys2wire(flags);
12141 if (cct->_conf.get_val<bool>("client_force_lazyio"))
12142 cflags |= CEPH_O_LAZY;
12143
12144 int cmode = ceph_flags_to_mode(cflags);
12145
12146 int64_t pool_id = -1;
12147 if (data_pool && *data_pool) {
12148 pool_id = objecter->with_osdmap(
12149 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
12150 if (pool_id < 0)
12151 return -EINVAL;
12152 if (pool_id > 0xffffffffll)
12153 return -ERANGE; // bummer!
12154 }
12155
12156 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
12157
12158 filepath path;
12159 dir->make_nosnap_relative_path(path);
12160 path.push_dentry(name);
12161 req->set_filepath(path);
12162 req->set_inode(dir);
12163 req->head.args.open.flags = cflags | CEPH_O_CREAT;
12164
12165 req->head.args.open.stripe_unit = stripe_unit;
12166 req->head.args.open.stripe_count = stripe_count;
12167 req->head.args.open.object_size = object_size;
12168 if (cct->_conf->client_debug_getattr_caps)
12169 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
12170 else
12171 req->head.args.open.mask = 0;
12172 req->head.args.open.pool = pool_id;
12173 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12174 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12175
12176 mode |= S_IFREG;
12177 bufferlist xattrs_bl;
12178 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12179 if (res < 0)
12180 goto fail;
12181 req->head.args.open.mode = mode;
12182 if (xattrs_bl.length() > 0)
12183 req->set_data(xattrs_bl);
12184
12185 Dentry *de;
12186 res = get_or_create(dir, name, &de);
12187 if (res < 0)
12188 goto fail;
12189 req->set_dentry(de);
12190
12191 res = make_request(req, perms, inp, created);
12192 if (res < 0) {
12193 goto reply_error;
12194 }
12195
12196 /* If the caller passed a value in fhp, do the open */
12197 if(fhp) {
12198 (*inp)->get_open_ref(cmode);
12199 *fhp = _create_fh(inp->get(), flags, cmode, perms);
12200 }
12201
12202 reply_error:
12203 trim_cache();
12204
12205 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
12206 << " layout " << stripe_unit
12207 << ' ' << stripe_count
12208 << ' ' << object_size
12209 <<") = " << res << dendl;
12210 return res;
12211
12212 fail:
12213 put_request(req);
12214 return res;
12215 }
12216
12217
12218 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
12219 InodeRef *inp)
12220 {
12221 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
12222 << mode << dec << ", uid " << perm.uid()
12223 << ", gid " << perm.gid() << ")" << dendl;
12224
12225 if (strlen(name) > NAME_MAX)
12226 return -ENAMETOOLONG;
12227
12228 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12229 return -EROFS;
12230 }
12231 if (is_quota_files_exceeded(dir, perm)) {
12232 return -EDQUOT;
12233 }
12234 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
12235 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
12236
12237 filepath path;
12238 dir->make_nosnap_relative_path(path);
12239 path.push_dentry(name);
12240 req->set_filepath(path);
12241 req->set_inode(dir);
12242 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12243 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12244
12245 mode |= S_IFDIR;
12246 bufferlist xattrs_bl;
12247 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
12248 if (res < 0)
12249 goto fail;
12250 req->head.args.mkdir.mode = mode;
12251 if (xattrs_bl.length() > 0)
12252 req->set_data(xattrs_bl);
12253
12254 Dentry *de;
12255 res = get_or_create(dir, name, &de);
12256 if (res < 0)
12257 goto fail;
12258 req->set_dentry(de);
12259
12260 ldout(cct, 10) << "_mkdir: making request" << dendl;
12261 res = make_request(req, perm, inp);
12262 ldout(cct, 10) << "_mkdir result is " << res << dendl;
12263
12264 trim_cache();
12265
12266 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12267 return res;
12268
12269 fail:
12270 put_request(req);
12271 return res;
12272 }
12273
12274 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12275 struct stat *attr, Inode **out, const UserPerm& perm)
12276 {
12277 std::lock_guard lock(client_lock);
12278
12279 if (unmounting)
12280 return -ENOTCONN;
12281
12282 vinodeno_t vparent = _get_vino(parent);
12283
12284 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12285 tout(cct) << "ll_mkdir" << std::endl;
12286 tout(cct) << vparent.ino.val << std::endl;
12287 tout(cct) << name << std::endl;
12288 tout(cct) << mode << std::endl;
12289
12290 if (!fuse_default_permissions) {
12291 int r = may_create(parent, perm);
12292 if (r < 0)
12293 return r;
12294 }
12295
12296 InodeRef in;
12297 int r = _mkdir(parent, name, mode, perm, &in);
12298 if (r == 0) {
12299 fill_stat(in, attr);
12300 _ll_get(in.get());
12301 }
12302 tout(cct) << attr->st_ino << std::endl;
12303 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12304 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12305 *out = in.get();
12306 return r;
12307 }
12308
12309 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12310 struct ceph_statx *stx, unsigned want, unsigned flags,
12311 const UserPerm& perms)
12312 {
12313 std::lock_guard lock(client_lock);
12314
12315 if (unmounting)
12316 return -ENOTCONN;
12317
12318 vinodeno_t vparent = _get_vino(parent);
12319
12320 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12321 tout(cct) << "ll_mkdirx" << std::endl;
12322 tout(cct) << vparent.ino.val << std::endl;
12323 tout(cct) << name << std::endl;
12324 tout(cct) << mode << std::endl;
12325
12326 if (!fuse_default_permissions) {
12327 int r = may_create(parent, perms);
12328 if (r < 0)
12329 return r;
12330 }
12331
12332 InodeRef in;
12333 int r = _mkdir(parent, name, mode, perms, &in);
12334 if (r == 0) {
12335 fill_statx(in, statx_to_mask(flags, want), stx);
12336 _ll_get(in.get());
12337 } else {
12338 stx->stx_ino = 0;
12339 stx->stx_mask = 0;
12340 }
12341 tout(cct) << stx->stx_ino << std::endl;
12342 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12343 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12344 *out = in.get();
12345 return r;
12346 }
12347
12348 int Client::_symlink(Inode *dir, const char *name, const char *target,
12349 const UserPerm& perms, InodeRef *inp)
12350 {
12351 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
12352 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
12353 << dendl;
12354
12355 if (strlen(name) > NAME_MAX)
12356 return -ENAMETOOLONG;
12357
12358 if (dir->snapid != CEPH_NOSNAP) {
12359 return -EROFS;
12360 }
12361 if (is_quota_files_exceeded(dir, perms)) {
12362 return -EDQUOT;
12363 }
12364
12365 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
12366
12367 filepath path;
12368 dir->make_nosnap_relative_path(path);
12369 path.push_dentry(name);
12370 req->set_filepath(path);
12371 req->set_inode(dir);
12372 req->set_string2(target);
12373 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12374 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12375
12376 Dentry *de;
12377 int res = get_or_create(dir, name, &de);
12378 if (res < 0)
12379 goto fail;
12380 req->set_dentry(de);
12381
12382 res = make_request(req, perms, inp);
12383
12384 trim_cache();
12385 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
12386 res << dendl;
12387 return res;
12388
12389 fail:
12390 put_request(req);
12391 return res;
12392 }
12393
12394 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12395 struct stat *attr, Inode **out, const UserPerm& perms)
12396 {
12397 std::lock_guard lock(client_lock);
12398
12399 if (unmounting)
12400 return -ENOTCONN;
12401
12402 vinodeno_t vparent = _get_vino(parent);
12403
12404 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12405 << dendl;
12406 tout(cct) << "ll_symlink" << std::endl;
12407 tout(cct) << vparent.ino.val << std::endl;
12408 tout(cct) << name << std::endl;
12409 tout(cct) << value << std::endl;
12410
12411 if (!fuse_default_permissions) {
12412 int r = may_create(parent, perms);
12413 if (r < 0)
12414 return r;
12415 }
12416
12417 InodeRef in;
12418 int r = _symlink(parent, name, value, perms, &in);
12419 if (r == 0) {
12420 fill_stat(in, attr);
12421 _ll_get(in.get());
12422 }
12423 tout(cct) << attr->st_ino << std::endl;
12424 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12425 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12426 *out = in.get();
12427 return r;
12428 }
12429
12430 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12431 Inode **out, struct ceph_statx *stx, unsigned want,
12432 unsigned flags, const UserPerm& perms)
12433 {
12434 std::lock_guard lock(client_lock);
12435
12436 if (unmounting)
12437 return -ENOTCONN;
12438
12439 vinodeno_t vparent = _get_vino(parent);
12440
12441 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12442 << dendl;
12443 tout(cct) << "ll_symlinkx" << std::endl;
12444 tout(cct) << vparent.ino.val << std::endl;
12445 tout(cct) << name << std::endl;
12446 tout(cct) << value << std::endl;
12447
12448 if (!fuse_default_permissions) {
12449 int r = may_create(parent, perms);
12450 if (r < 0)
12451 return r;
12452 }
12453
12454 InodeRef in;
12455 int r = _symlink(parent, name, value, perms, &in);
12456 if (r == 0) {
12457 fill_statx(in, statx_to_mask(flags, want), stx);
12458 _ll_get(in.get());
12459 }
12460 tout(cct) << stx->stx_ino << std::endl;
12461 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12462 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12463 *out = in.get();
12464 return r;
12465 }
12466
12467 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12468 {
12469 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
12470 << " uid " << perm.uid() << " gid " << perm.gid()
12471 << ")" << dendl;
12472
12473 if (dir->snapid != CEPH_NOSNAP) {
12474 return -EROFS;
12475 }
12476
12477 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12478
12479 filepath path;
12480 dir->make_nosnap_relative_path(path);
12481 path.push_dentry(name);
12482 req->set_filepath(path);
12483
12484 InodeRef otherin;
12485 Inode *in;
12486 Dentry *de;
12487
12488 int res = get_or_create(dir, name, &de);
12489 if (res < 0)
12490 goto fail;
12491 req->set_dentry(de);
12492 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12493 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12494
12495 res = _lookup(dir, name, 0, &otherin, perm);
12496 if (res < 0)
12497 goto fail;
12498
12499 in = otherin.get();
12500 req->set_other_inode(in);
12501 in->break_all_delegs();
12502 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12503
12504 req->set_inode(dir);
12505
12506 res = make_request(req, perm);
12507
12508 trim_cache();
12509 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
12510 return res;
12511
12512 fail:
12513 put_request(req);
12514 return res;
12515 }
12516
12517 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12518 {
12519 std::lock_guard lock(client_lock);
12520
12521 if (unmounting)
12522 return -ENOTCONN;
12523
12524 vinodeno_t vino = _get_vino(in);
12525
12526 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12527 tout(cct) << "ll_unlink" << std::endl;
12528 tout(cct) << vino.ino.val << std::endl;
12529 tout(cct) << name << std::endl;
12530
12531 if (!fuse_default_permissions) {
12532 int r = may_delete(in, name, perm);
12533 if (r < 0)
12534 return r;
12535 }
12536 return _unlink(in, name, perm);
12537 }
12538
12539 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12540 {
12541 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
12542 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12543
12544 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12545 return -EROFS;
12546 }
12547
12548 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12549 MetaRequest *req = new MetaRequest(op);
12550 filepath path;
12551 dir->make_nosnap_relative_path(path);
12552 path.push_dentry(name);
12553 req->set_filepath(path);
12554 req->set_inode(dir);
12555
12556 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12557 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12558 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12559
12560 InodeRef in;
12561
12562 Dentry *de;
12563 int res = get_or_create(dir, name, &de);
12564 if (res < 0)
12565 goto fail;
12566 if (op == CEPH_MDS_OP_RMDIR)
12567 req->set_dentry(de);
12568 else
12569 de->get();
12570
12571 res = _lookup(dir, name, 0, &in, perms);
12572 if (res < 0)
12573 goto fail;
12574
12575 if (op == CEPH_MDS_OP_RMSNAP) {
12576 unlink(de, true, true);
12577 de->put();
12578 }
12579 req->set_other_inode(in.get());
12580
12581 res = make_request(req, perms);
12582
12583 trim_cache();
12584 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
12585 return res;
12586
12587 fail:
12588 put_request(req);
12589 return res;
12590 }
12591
12592 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12593 {
12594 std::lock_guard lock(client_lock);
12595
12596 if (unmounting)
12597 return -ENOTCONN;
12598
12599 vinodeno_t vino = _get_vino(in);
12600
12601 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12602 tout(cct) << "ll_rmdir" << std::endl;
12603 tout(cct) << vino.ino.val << std::endl;
12604 tout(cct) << name << std::endl;
12605
12606 if (!fuse_default_permissions) {
12607 int r = may_delete(in, name, perms);
12608 if (r < 0)
12609 return r;
12610 }
12611
12612 return _rmdir(in, name, perms);
12613 }
12614
12615 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12616 {
12617 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
12618 << todir->ino << " " << toname
12619 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12620 << dendl;
12621
12622 if (fromdir->snapid != todir->snapid)
12623 return -EXDEV;
12624
12625 int op = CEPH_MDS_OP_RENAME;
12626 if (fromdir->snapid != CEPH_NOSNAP) {
12627 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12628 op = CEPH_MDS_OP_RENAMESNAP;
12629 else
12630 return -EROFS;
12631 }
12632 if (fromdir != todir) {
12633 Inode *fromdir_root =
12634 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12635 Inode *todir_root =
12636 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12637 if (fromdir_root != todir_root) {
12638 return -EXDEV;
12639 }
12640 }
12641
12642 InodeRef target;
12643 MetaRequest *req = new MetaRequest(op);
12644
12645 filepath from;
12646 fromdir->make_nosnap_relative_path(from);
12647 from.push_dentry(fromname);
12648 filepath to;
12649 todir->make_nosnap_relative_path(to);
12650 to.push_dentry(toname);
12651 req->set_filepath(to);
12652 req->set_filepath2(from);
12653
12654 Dentry *oldde;
12655 int res = get_or_create(fromdir, fromname, &oldde);
12656 if (res < 0)
12657 goto fail;
12658 Dentry *de;
12659 res = get_or_create(todir, toname, &de);
12660 if (res < 0)
12661 goto fail;
12662
12663 if (op == CEPH_MDS_OP_RENAME) {
12664 req->set_old_dentry(oldde);
12665 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12666 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12667
12668 req->set_dentry(de);
12669 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12670 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12671
12672 InodeRef oldin, otherin;
12673 res = _lookup(fromdir, fromname, 0, &oldin, perm);
12674 if (res < 0)
12675 goto fail;
12676
12677 Inode *oldinode = oldin.get();
12678 oldinode->break_all_delegs();
12679 req->set_old_inode(oldinode);
12680 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12681
12682 res = _lookup(todir, toname, 0, &otherin, perm);
12683 switch (res) {
12684 case 0:
12685 {
12686 Inode *in = otherin.get();
12687 req->set_other_inode(in);
12688 in->break_all_delegs();
12689 }
12690 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12691 break;
12692 case -ENOENT:
12693 break;
12694 default:
12695 goto fail;
12696 }
12697
12698 req->set_inode(todir);
12699 } else {
12700 // renamesnap reply contains no tracedn, so we need to invalidate
12701 // dentry manually
12702 unlink(oldde, true, true);
12703 unlink(de, true, true);
12704
12705 req->set_inode(todir);
12706 }
12707
12708 res = make_request(req, perm, &target);
12709 ldout(cct, 10) << "rename result is " << res << dendl;
12710
12711 // trim any now-stale dentries (e.g. the renamed item) from our cache
12712
12713 trim_cache();
12714 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
12715 return res;
12716
12717 fail:
12718 put_request(req);
12719 return res;
12720 }
12721
12722 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12723 const char *newname, const UserPerm& perm)
12724 {
12725 std::lock_guard lock(client_lock);
12726
12727 if (unmounting)
12728 return -ENOTCONN;
12729
12730 vinodeno_t vparent = _get_vino(parent);
12731 vinodeno_t vnewparent = _get_vino(newparent);
12732
12733 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12734 << vnewparent << " " << newname << dendl;
12735 tout(cct) << "ll_rename" << std::endl;
12736 tout(cct) << vparent.ino.val << std::endl;
12737 tout(cct) << name << std::endl;
12738 tout(cct) << vnewparent.ino.val << std::endl;
12739 tout(cct) << newname << std::endl;
12740
12741 if (!fuse_default_permissions) {
12742 int r = may_delete(parent, name, perm);
12743 if (r < 0)
12744 return r;
12745 r = may_delete(newparent, newname, perm);
12746 if (r < 0 && r != -ENOENT)
12747 return r;
12748 }
12749
12750 return _rename(parent, name, newparent, newname, perm);
12751 }
12752
12753 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12754 {
12755 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
12756 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12757
12758 if (strlen(newname) > NAME_MAX)
12759 return -ENAMETOOLONG;
12760
12761 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12762 return -EROFS;
12763 }
12764 if (is_quota_files_exceeded(dir, perm)) {
12765 return -EDQUOT;
12766 }
12767
12768 in->break_all_delegs();
12769 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12770
12771 filepath path(newname, dir->ino);
12772 req->set_filepath(path);
12773 filepath existing(in->ino);
12774 req->set_filepath2(existing);
12775
12776 req->set_inode(dir);
12777 req->inode_drop = CEPH_CAP_FILE_SHARED;
12778 req->inode_unless = CEPH_CAP_FILE_EXCL;
12779
12780 Dentry *de;
12781 int res = get_or_create(dir, newname, &de);
12782 if (res < 0)
12783 goto fail;
12784 req->set_dentry(de);
12785
12786 res = make_request(req, perm, inp);
12787 ldout(cct, 10) << "link result is " << res << dendl;
12788
12789 trim_cache();
12790 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
12791 return res;
12792
12793 fail:
12794 put_request(req);
12795 return res;
12796 }
12797
12798 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12799 const UserPerm& perm)
12800 {
12801 std::lock_guard lock(client_lock);
12802
12803 if (unmounting)
12804 return -ENOTCONN;
12805
12806 vinodeno_t vino = _get_vino(in);
12807 vinodeno_t vnewparent = _get_vino(newparent);
12808
12809 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12810 newname << dendl;
12811 tout(cct) << "ll_link" << std::endl;
12812 tout(cct) << vino.ino.val << std::endl;
12813 tout(cct) << vnewparent << std::endl;
12814 tout(cct) << newname << std::endl;
12815
12816 InodeRef target;
12817
12818 if (!fuse_default_permissions) {
12819 if (S_ISDIR(in->mode))
12820 return -EPERM;
12821
12822 int r = may_hardlink(in, perm);
12823 if (r < 0)
12824 return r;
12825
12826 r = may_create(newparent, perm);
12827 if (r < 0)
12828 return r;
12829 }
12830
12831 return _link(in, newparent, newname, perm, &target);
12832 }
12833
12834 int Client::ll_num_osds(void)
12835 {
12836 std::lock_guard lock(client_lock);
12837 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12838 }
12839
12840 int Client::ll_osdaddr(int osd, uint32_t *addr)
12841 {
12842 std::lock_guard lock(client_lock);
12843
12844 entity_addr_t g;
12845 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12846 if (!o.exists(osd))
12847 return false;
12848 g = o.get_addrs(osd).front();
12849 return true;
12850 });
12851 if (!exists)
12852 return -1;
12853 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12854 *addr = ntohl(nb_addr);
12855 return 0;
12856 }
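/*
 * Annotation (not upstream code): ll_osdaddr() reports only the first
 * address of the OSD as an IPv4 value in host byte order. A hedged usage
 * sketch:
 *
 *   uint32_t a;
 *   if (client->ll_osdaddr(0, &a) == 0)
 *     printf("osd.0 at %u.%u.%u.%u\n",
 *            (a >> 24) & 0xff, (a >> 16) & 0xff, (a >> 8) & 0xff, a & 0xff);
 */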
12857
12858 uint32_t Client::ll_stripe_unit(Inode *in)
12859 {
12860 std::lock_guard lock(client_lock);
12861 return in->layout.stripe_unit;
12862 }
12863
12864 uint64_t Client::ll_snap_seq(Inode *in)
12865 {
12866 std::lock_guard lock(client_lock);
12867 return in->snaprealm->seq;
12868 }
12869
12870 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12871 {
12872 std::lock_guard lock(client_lock);
12873 *layout = in->layout;
12874 return 0;
12875 }
12876
12877 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12878 {
12879 return ll_file_layout(fh->inode.get(), layout);
12880 }
12881
12882 /* Currently we cannot take advantage of redundancy in reads, since we
12883 would have to go through all possible placement groups (a
12884 potentially quite large number determined by a hash), and use CRUSH
12885 to calculate the appropriate set of OSDs for each placement group,
12886 then index into that. An array with one entry per OSD is much more
12887 tractable and works for demonstration purposes. */
12888
12889 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12890 file_layout_t* layout)
12891 {
12892 std::lock_guard lock(client_lock);
12893
12894 inodeno_t ino = in->ino;
12895 uint32_t object_size = layout->object_size;
12896 uint32_t su = layout->stripe_unit;
12897 uint32_t stripe_count = layout->stripe_count;
12898 uint64_t stripes_per_object = object_size / su;
12899 uint64_t stripeno = 0, stripepos = 0;
12900
12901 if(stripe_count) {
12902 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12903 stripepos = blockno % stripe_count; // which object in the object set (X)
12904 }
12905 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12906 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12907
12908 object_t oid = file_object_t(ino, objectno);
12909 return objecter->with_osdmap([&](const OSDMap& o) {
12910 ceph_object_layout olayout =
12911 o.file_to_object_layout(oid, *layout);
12912 pg_t pg = (pg_t)olayout.ol_pgid;
12913 vector<int> osds;
12914 int primary;
12915 o.pg_to_acting_osds(pg, &osds, &primary);
12916 return primary;
12917 });
12918 }
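/*
 * Annotation (not upstream code): a worked example of the striping
 * arithmetic above, assuming object_size = 4 MiB, stripe_unit = 1 MiB,
 * stripe_count = 4 and blockno = 9:
 *
 *   stripes_per_object = 4194304 / 1048576 = 4
 *   stripeno    = 9 / 4 = 2      // third horizontal stripe
 *   stripepos   = 9 % 4 = 1      // second object within the set
 *   objectsetno = 2 / 4 = 0      // first object set
 *   objectno    = 0 * 4 + 1 = 1  // -> file_object_t(ino, 1)
 *
 * The primary OSD is then whatever CRUSH maps that object's PG to.
 */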
12919
12920 /* Return the offset of the block, internal to the object */
12921
12922 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12923 {
12924 std::lock_guard lock(client_lock);
12925 file_layout_t *layout=&(in->layout);
12926 uint32_t object_size = layout->object_size;
12927 uint32_t su = layout->stripe_unit;
12928 uint64_t stripes_per_object = object_size / su;
12929
12930 return (blockno % stripes_per_object) * su;
12931 }
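/*
 * Annotation (not upstream code): continuing the example above
 * (stripe_unit = 1 MiB, 4 stripes per object), blockno = 9 gives
 * (9 % 4) * 1048576 = 1048576, i.e. the block starts 1 MiB into its
 * object.
 */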
12932
12933 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12934 const UserPerm& perms)
12935 {
12936 std::lock_guard lock(client_lock);
12937
12938 if (unmounting)
12939 return -ENOTCONN;
12940
12941 vinodeno_t vino = _get_vino(in);
12942
12943 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12944 tout(cct) << "ll_opendir" << std::endl;
12945 tout(cct) << vino.ino.val << std::endl;
12946
12947 if (!fuse_default_permissions) {
12948 int r = may_open(in, flags, perms);
12949 if (r < 0)
12950 return r;
12951 }
12952
12953 int r = _opendir(in, dirpp, perms);
12954 tout(cct) << (unsigned long)*dirpp << std::endl;
12955
12956 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12957 << dendl;
12958 return r;
12959 }
12960
12961 int Client::ll_releasedir(dir_result_t *dirp)
12962 {
12963 std::lock_guard lock(client_lock);
12964 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12965 tout(cct) << "ll_releasedir" << std::endl;
12966 tout(cct) << (unsigned long)dirp << std::endl;
12967
12968 if (unmounting)
12969 return -ENOTCONN;
12970
12971 _closedir(dirp);
12972 return 0;
12973 }
12974
12975 int Client::ll_fsyncdir(dir_result_t *dirp)
12976 {
12977 std::lock_guard lock(client_lock);
12978 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12979 tout(cct) << "ll_fsyncdir" << std::endl;
12980 tout(cct) << (unsigned long)dirp << std::endl;
12981
12982 if (unmounting)
12983 return -ENOTCONN;
12984
12985 return _fsync(dirp->inode.get(), false);
12986 }
12987
12988 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12989 {
12990 ceph_assert(!(flags & O_CREAT));
12991
12992 std::lock_guard lock(client_lock);
12993
12994 if (unmounting)
12995 return -ENOTCONN;
12996
12997 vinodeno_t vino = _get_vino(in);
12998
12999 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
13000 tout(cct) << "ll_open" << std::endl;
13001 tout(cct) << vino.ino.val << std::endl;
13002 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13003
13004 int r;
13005 if (!fuse_default_permissions) {
13006 r = may_open(in, flags, perms);
13007 if (r < 0)
13008 goto out;
13009 }
13010
13011 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
13012
13013 out:
13014 Fh *fhptr = fhp ? *fhp : NULL;
13015 if (fhptr) {
13016 ll_unclosed_fh_set.insert(fhptr);
13017 }
13018 tout(cct) << (unsigned long)fhptr << std::endl;
13019 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
13020 " = " << r << " (" << fhptr << ")" << dendl;
13021 return r;
13022 }
13023
13024 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
13025 int flags, InodeRef *in, int caps, Fh **fhp,
13026 const UserPerm& perms)
13027 {
13028 *fhp = NULL;
13029
13030 vinodeno_t vparent = _get_vino(parent);
13031
13032 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13033 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
13034 << ", gid " << perms.gid() << dendl;
13035 tout(cct) << "ll_create" << std::endl;
13036 tout(cct) << vparent.ino.val << std::endl;
13037 tout(cct) << name << std::endl;
13038 tout(cct) << mode << std::endl;
13039 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13040
13041 bool created = false;
13042 int r = _lookup(parent, name, caps, in, perms);
13043
13044 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
13045 return -EEXIST;
13046
13047 if (r == -ENOENT && (flags & O_CREAT)) {
13048 if (!fuse_default_permissions) {
13049 r = may_create(parent, perms);
13050 if (r < 0)
13051 goto out;
13052 }
13053 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
13054 perms);
13055 if (r < 0)
13056 goto out;
13057 }
13058
13059 if (r < 0)
13060 goto out;
13061
13062 ceph_assert(*in);
13063
13064 ldout(cct, 20) << "_ll_create created = " << created << dendl;
13065 if (!created) {
13066 if (!fuse_default_permissions) {
13067 r = may_open(in->get(), flags, perms);
13068 if (r < 0) {
13069 if (*fhp) {
13070 int release_r = _release_fh(*fhp);
13071 ceph_assert(release_r == 0); // during create, no async data ops should have happened
13072 }
13073 goto out;
13074 }
13075 }
13076 if (*fhp == NULL) {
13077 r = _open(in->get(), flags, mode, fhp, perms);
13078 if (r < 0)
13079 goto out;
13080 }
13081 }
13082
13083 out:
13084 if (*fhp) {
13085 ll_unclosed_fh_set.insert(*fhp);
13086 }
13087
13088 ino_t ino = 0;
13089 if (r >= 0) {
13090 Inode *inode = in->get();
13091 if (use_faked_inos())
13092 ino = inode->faked_ino;
13093 else
13094 ino = inode->ino;
13095 }
13096
13097 tout(cct) << (unsigned long)*fhp << std::endl;
13098 tout(cct) << ino << std::endl;
13099 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13100 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
13101 *fhp << " " << hex << ino << dec << ")" << dendl;
13102
13103 return r;
13104 }
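/*
 * Annotation (not upstream code): _ll_create() is lookup-then-create. An
 * existing name with O_CREAT|O_EXCL fails with -EEXIST; -ENOENT plus
 * O_CREAT triggers _create(); and on every success path the file ends up
 * open, either via the Fh returned by _create() or via the explicit
 * _open() above, so *fhp is valid whenever r >= 0 and a Fh was requested.
 */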
13105
13106 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13107 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13108 const UserPerm& perms)
13109 {
13110 std::lock_guard lock(client_lock);
13111 InodeRef in;
13112
13113 if (unmounting)
13114 return -ENOTCONN;
13115
13116 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13117 fhp, perms);
13118 if (r >= 0) {
13119 ceph_assert(in);
13120
13121 // passing an Inode in outp requires an additional ref
13122 if (outp) {
13123 _ll_get(in.get());
13124 *outp = in.get();
13125 }
13126 fill_stat(in, attr);
13127 } else {
13128 attr->st_ino = 0;
13129 }
13130
13131 return r;
13132 }
13133
13134 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13135 int oflags, Inode **outp, Fh **fhp,
13136 struct ceph_statx *stx, unsigned want, unsigned lflags,
13137 const UserPerm& perms)
13138 {
13139 unsigned caps = statx_to_mask(lflags, want);
13140 std::lock_guard lock(client_lock);
13141 InodeRef in;
13142
13143 if (unmounting)
13144 return -ENOTCONN;
13145
13146 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13147 if (r >= 0) {
13148 ceph_assert(in);
13149
13150 // passing an Inode in outp requires an additional ref
13151 if (outp) {
13152 _ll_get(in.get());
13153 *outp = in.get();
13154 }
13155 fill_statx(in, caps, stx);
13156 } else {
13157 stx->stx_ino = 0;
13158 stx->stx_mask = 0;
13159 }
13160
13161 return r;
13162 }
13163
13164 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13165 {
13166 std::lock_guard lock(client_lock);
13167 tout(cct) << "ll_lseek" << std::endl;
13168 tout(cct) << offset << std::endl;
13169 tout(cct) << whence << std::endl;
13170
13171 if (unmounting)
13172 return -ENOTCONN;
13173
13174 return _lseek(fh, offset, whence);
13175 }
13176
13177 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13178 {
13179 std::lock_guard lock(client_lock);
13180 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
13181 tout(cct) << "ll_read" << std::endl;
13182 tout(cct) << (unsigned long)fh << std::endl;
13183 tout(cct) << off << std::endl;
13184 tout(cct) << len << std::endl;
13185
13186 if (unmounting)
13187 return -ENOTCONN;
13188
13189 /* We can't return more bytes read than INT_MAX, clamp len to that */
13190 len = std::min(len, (loff_t)INT_MAX);
13191 return _read(fh, off, len, bl);
13192 }
13193
13194 int Client::ll_read_block(Inode *in, uint64_t blockid,
13195 char *buf,
13196 uint64_t offset,
13197 uint64_t length,
13198 file_layout_t* layout)
13199 {
13200 std::lock_guard lock(client_lock);
13201
13202 if (unmounting)
13203 return -ENOTCONN;
13204
13205 vinodeno_t vino = _get_vino(in);
13206 object_t oid = file_object_t(vino.ino, blockid);
13207 C_SaferCond onfinish;
13208 bufferlist bl;
13209
13210 objecter->read(oid,
13211 object_locator_t(layout->pool_id),
13212 offset,
13213 length,
13214 vino.snapid,
13215 &bl,
13216 CEPH_OSD_FLAG_READ,
13217 &onfinish);
13218
13219 client_lock.Unlock();
13220 int r = onfinish.wait();
13221 client_lock.Lock();
13222
13223 if (r >= 0) {
13224 bl.copy(0, bl.length(), buf);
13225 r = bl.length();
13226 }
13227
13228 return r;
13229 }
13230
13231 /* It appears that the OSD doesn't return success unless the entire
13232 buffer was written, so return the full write length on success. */
13233
13234 int Client::ll_write_block(Inode *in, uint64_t blockid,
13235 char* buf, uint64_t offset,
13236 uint64_t length, file_layout_t* layout,
13237 uint64_t snapseq, uint32_t sync)
13238 {
13239 vinodeno_t vino = ll_get_vino(in);
13240 int r = 0;
13241 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13242
13243 if (length == 0) {
13244 return -EINVAL;
13245 }
13246 if (true || sync) { // currently every block write is treated as stable
13247 /* if the write is stable, the epilogue below waits on
13248 * the "flock" completion condition */
13249 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
13250 }
13251 object_t oid = file_object_t(vino.ino, blockid);
13252 SnapContext fakesnap;
13253 ceph::bufferlist bl;
13254 if (length > 0) {
13255 bl.push_back(buffer::copy(buf, length));
13256 }
13257
13258 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
13259 << dendl;
13260
13261 fakesnap.seq = snapseq;
13262
13263 /* lock just in time */
13264 client_lock.Lock();
13265 if (unmounting) {
13266 client_lock.Unlock();
13267 return -ENOTCONN;
13268 }
13269
13270 objecter->write(oid,
13271 object_locator_t(layout->pool_id),
13272 offset,
13273 length,
13274 fakesnap,
13275 bl,
13276 ceph::real_clock::now(),
13277 0,
13278 onsafe.get());
13279
13280 client_lock.Unlock();
13281 if (nullptr != onsafe) {
13282 r = onsafe->wait();
13283 }
13284
13285 if (r < 0) {
13286 return r;
13287 } else {
13288 return length;
13289 }
13290 }
13291
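// Hedged usage sketch: write one 4 KiB block synchronously against the
// head object (snapseq 0).  `client', `in' and `layout' are assumed to
// be valid.
//
//   char buf[4096];
//   memset(buf, 0, sizeof(buf));
//   int r = client->ll_write_block(in, 0 /*blockid*/, buf, 0 /*offset*/,
//                                  sizeof(buf), &layout, 0 /*snapseq*/,
//                                  1 /*sync*/);
//   // r == sizeof(buf) on success, negative errno on failure
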
int Client::ll_commit_blocks(Inode *in,
                             uint64_t offset,
                             uint64_t length)
{
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
                << offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}

int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
    "~" << len << dendl;
  tout(cct) << "ll_write" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return bytes written larger than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  int r = _write(fh, off, len, data, NULL, 0);
  ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
                << dendl;
  return r;
}

int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  if (unmounting)
    return -ENOTCONN;
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
}

int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  if (unmounting)
    return -ENOTCONN;
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
}

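// Hedged usage sketch: scatter-gather I/O with the vectored calls, which
// mirror POSIX pwritev/preadv semantics.  Names are illustrative.
//
//   char hdr[16], payload[4096];
//   struct iovec iov[2] = {
//     { hdr,     sizeof(hdr)     },
//     { payload, sizeof(payload) },
//   };
//   int64_t n = client->ll_writev(fh, iov, 2, 0 /*off*/);
//   // n == sizeof(hdr) + sizeof(payload) on success
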
int Client::ll_flush(Fh *fh)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flush(fh);
}

int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH
    fh->take_async_err();
  }
  return r;
}

int Client::ll_sync_inode(Inode *in, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_sync_inode " << *in << dendl;
  tout(cct) << "ll_sync_inode" << std::endl;
  tout(cct) << (unsigned long)in << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fsync(in, syncdataonly);
}

#ifdef FALLOC_FL_PUNCH_HOLE

int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // inline data: rebuild it locally with the punched range zeroed
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      client_lock.Unlock();
      onfinish.wait();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();

    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif


int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fallocate(fh, mode, offset, length);
}

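// Hedged usage sketch: the only hole-punching combination accepted by
// _fallocate() above is PUNCH_HOLE together with KEEP_SIZE.  Names are
// illustrative.
//
//   // punch a 1 MiB hole at offset 4 MiB without changing the file size
//   int r = client->ll_fallocate(fh,
//                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
//                                4 << 20, 1 << 20);
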
int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  return _fallocate(fh, mode, offset, length);
}

int Client::ll_release(Fh *fh)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (ll_unclosed_fh_set.count(fh))
    ll_unclosed_fh_set.erase(fh);
  return _release_fh(fh);
}

int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_getlk (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _getlk(fh, fl, owner);
}

int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _setlk(fh, fl, owner, sleep);
}

int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flock(fh, cmd, owner);
}

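// Hedged usage sketch: BSD-style whole-file locking through ll_flock.
// The `owner' value identifies the lock holder and must stay stable for
// the lifetime of the lock; names are illustrative.
//
//   uint64_t owner = reinterpret_cast<uint64_t>(fh);  // any stable id
//   int r = client->ll_flock(fh, LOCK_EX | LOCK_NB, owner);
//   if (r == 0) {
//     // ... critical section ...
//     client->ll_flock(fh, LOCK_UN, owner);
//   }
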
int Client::set_deleg_timeout(uint32_t timeout)
{
  std::lock_guard lock(client_lock);

  /*
   * The whole point is to prevent blacklisting, so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}

int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -EINVAL;

  std::lock_guard lock(client_lock);

  if (!mounted)
    return -ENOTCONN;

  Inode *inode = fh->inode.get();

  switch(cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {
      ret = -ENOMEM;
    }
    break;
  }
  return ret;
}

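// Hedged usage sketch: request a read delegation on an open file and
// later return it.  CEPH_DELEGATION_RD and the callback signature come
// from the libcephfs delegation API; `deleg_cb' and `priv' are assumed
// to be defined by the caller.
//
//   int r = client->ll_delegation(fh, CEPH_DELEGATION_RD, deleg_cb, priv);
//   if (r == 0) {
//     // ... cached reads are safe until deleg_cb fires ...
//     client->ll_delegation(fh, CEPH_DELEGATION_NONE, nullptr, nullptr);
//   }
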
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};

void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}

// =========================================
// layout

// expose file layouts

int Client::describe_layout(const char *relpath, file_layout_t *lp,
                            const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
  return 0;
}

int Client::fdescribe_layout(int fd, file_layout_t *lp)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
  return 0;
}

int64_t Client::get_default_pool_id()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}

// expose osdmap

int64_t Client::get_pool_id(const char *pool_name)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
                               pool_name);
}

string Client::get_pool_name(int64_t pool)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return string();

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
    });
}

int Client::get_pool_replication(int64_t pool)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
    });
}

int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}

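// Hedged worked example of the remainder computation above: with a
// stripe unit of 4 MiB and off = 5 MiB, the returned length is
//
//   *len = su - (off % su) = 4 MiB - 1 MiB = 3 MiB
//
// i.e. the bytes from off to the end of the current stripe unit, which
// is exactly the range served by the OSDs reported in `osds'.
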
int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  if (id < 0)
    return -EINVAL;
  return objecter->with_osdmap([&](const OSDMap& o) {
      return o.crush->get_full_location_ordered(id, path);
    });
}

int Client::get_file_stripe_address(int fd, loff_t offset,
                                    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
                           in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
        return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
        entity_addr_t addr = o.get_addrs(osds[i]).front();
        address.push_back(addr);
      }
      return 0;
    });
}

int Client::get_osd_addr(int osd, entity_addr_t& addr)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
        return -ENOENT;

      addr = o.get_addrs(osd).front();
      return 0;
    });
}

int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
                             loff_t length, loff_t offset)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map to a list of extents
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);

  ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
  return 0;
}


/* find an osd with the same ip. -ENXIO if none. */
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
        local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
        local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}


// ===============================

void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}

bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}

void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  std::lock_guard l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (auto &p : mds_sessions) {
        if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
          mds = p.first;
          s = &p.second;
        }
      }
      if (mds >= 0) {
        assert (s != NULL);
        switch (s->state) {
        case MetaSession::STATE_CLOSING:
          ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
          _closed_mds_session(s);
          break;

        case MetaSession::STATE_OPENING:
          {
            ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
            list<Context*> waiters;
            waiters.swap(s->waiting_for_open);
            _closed_mds_session(s);
            MetaSession *news = _get_or_open_mds_session(mds);
            news->waiting_for_open.swap(waiters);
          }
          break;

        case MetaSession::STATE_OPEN:
          {
            objecter->maybe_request_map(); /* to check if we are blacklisted */
            const auto& conf = cct->_conf;
            if (conf->client_reconnect_stale) {
              ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
              _closed_mds_session(s);
            } else {
              ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
              s->state = MetaSession::STATE_STALE;
            }
          }
          break;

        case MetaSession::STATE_NEW:
        case MetaSession::STATE_CLOSED:
        default:
          break;
        }
      }
    }
    break;
  }
}

bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}

bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
{
  if (dest_type == CEPH_ENTITY_TYPE_MON)
    return true;
  *authorizer = monclient->build_authorizer(dest_type);
  return true;
}

Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;
  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
        break;

      if (p->second->quota.is_enable()) {
        quota_in = p->second;
        break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}

/**
 * Traverse the quota ancestors of the Inode; return true
 * if any of them satisfies the passed predicate
 */
bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
                                   std::function<bool (const Inode &in)> test)
{
  while (true) {
    ceph_assert(in != NULL);
    if (test(*in)) {
      return true;
    }

    if (in == root_ancestor) {
      // We're done traversing, drop out
      return false;
    } else {
      // Continue up the tree
      in = get_quota_root(in, perms);
    }
  }

  return false;
}

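// Hedged usage sketch: check_quota_condition() walks from `in' up through
// its quota roots and applies an arbitrary predicate at each level, e.g.:
//
//   bool any_quota = check_quota_condition(in, perms,
//     [](const Inode &i) {
//       // true if this level has any byte or file quota configured
//       return i.quota.max_bytes || i.quota.max_files;
//     });
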
bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [](const Inode &in) {
        return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
      });
}

bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
                                     const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [&new_bytes](const Inode &in) {
        return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
               > in.quota.max_bytes;
      });
}

bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [](const Inode &in) {
        if (in.quota.max_bytes) {
          if (in.rstat.rbytes >= in.quota.max_bytes) {
            return true;
          }

          ceph_assert(in.size >= in.reported_size);
          const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
          const uint64_t size = in.size - in.reported_size;
          return (space >> 4) < size;
        } else {
          return false;
        }
      });
}

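// Hedged worked example of the "approaching" heuristic above: a write is
// flagged once the locally buffered growth exceeds 1/16 of the remaining
// quota.  With max_bytes = 1 GiB and rbytes = 960 MiB, space = 64 MiB and
// (space >> 4) = 4 MiB, so more than 4 MiB of not-yet-reported growth
// (in.size - in.reported_size) triggers an early cap flush.
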
enum {
  POOL_CHECKED = 1,
  POOL_CHECKING = 2,
  POOL_READ = 4,
  POOL_WRITE = 8,
};

int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checks
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have already been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
                     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
                     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
                     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
                     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because the actual error code might be misleading for a
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
                   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
                   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}

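// Hedged note on the probe above: read access is inferred from a stat on
// the file's first object (0 or -ENOENT both mean the read was allowed)
// and write access from an exclusive create (0 or -EEXIST mean the write
// was allowed).  The cached entry is a bitmask, so a fully permitted pool
// ends up cached as:
//
//   pool_perms[perm_key] == (POOL_READ | POOL_WRITE | POOL_CHECKED);
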
int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];

      return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
    }
  }
  return -EAGAIN;
}

int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
        goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}

int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
                              const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  if (S_ISLNK(*mode))
    return 0;

  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
        goto out;

      if (r > 0) {
        r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
        if (r < 0)
          goto out;
        if (r > 0)
          xattrs[ACL_EA_ACCESS] = acl;
      }

      if (S_ISDIR(*mode))
        xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
        encode(xattrs, xattrs_bl);
    } else {
      if (umask_cb)
        *mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}

void Client::set_filer_flags(int flags)
{
  std::lock_guard l(client_lock);
  ceph_assert(flags == 0 ||
              flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}

void Client::clear_filer_flags(int flags)
{
  std::lock_guard l(client_lock);
  ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->clear_global_op_flag(flags);
}

// called before mount
void Client::set_uuid(const std::string& uuid)
{
  std::lock_guard l(client_lock);
  assert(initialized);
  assert(!uuid.empty());

  metadata["uuid"] = uuid;
  _close_sessions();
}

// called before mount. 0 means infinite
void Client::set_session_timeout(unsigned timeout)
{
  std::lock_guard l(client_lock);
  assert(initialized);

  metadata["timeout"] = stringify(timeout);
}

// called before mount
int Client::start_reclaim(const std::string& uuid, unsigned flags,
                          const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state != MetaSession::STATE_OPENING) {
        // umounting?
        return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      if (rejected_by_mds.count(mds))
        return -EPERM;
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
        session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = MClientReclaim::create(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.Unlock();
    cond.wait();
    client_lock.Lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
        return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  metadata["reclaiming_uuid"] = uuid;
  return 0;
}

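// Hedged usage sketch of the reclaim flow (both calls happen before
// mount): a client instance that may be replaced later advertises a
// uuid, and its successor reclaims that session state.  Names and the
// exact flag choice are illustrative.
//
//   old_client->set_uuid("nfs-ganesha-a");       // on the original client
//   // ... old client dies; its replacement starts up ...
//   int r = new_client->start_reclaim("nfs-ganesha-a", CEPH_RECLAIM_RESET,
//                                     "" /*fs_name: default*/);
//   if (r == 0)
//     new_client->finish_reclaim();
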
void Client::finish_reclaim()
{
  auto it = metadata.find("reclaiming_uuid");
  if (it == metadata.end()) {
    for (auto &p : mds_sessions)
      p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    return;
  }

  for (auto &p : mds_sessions) {
    p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    auto m = MClientReclaim::create("", MClientReclaim::FLAG_FINISH);
    p.second.con->send_message2(std::move(m));
  }

  metadata["uuid"] = it->second;
  metadata.erase(it);
}

void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}

/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}

const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}

void Client::handle_conf_change(const ConfigProxy& conf,
                                const std::set <std::string> &changed)
{
  std::lock_guard lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
}

void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}

void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}

mds_rank_t Client::_get_random_up_mds() const
{
  ceph_assert(client_lock.is_locked_by_me());

  std::set<mds_rank_t> up;
  mdsmap->get_up_mds_set(up);

  if (up.empty())
    return MDS_RANK_NONE;
  std::set<mds_rank_t>::const_iterator p = up.begin();
  for (int n = rand() % up.size(); n; n--)
    ++p;
  return *p;
}


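// Hedged note: the selection above advances an iterator rand() % up.size()
// steps, which is uniform up to the usual rand() modulo bias.  A sketch of
// a bias-free alternative (not what this file does) would be:
//
//   #include <iterator>
//   #include <random>
//   static thread_local std::mt19937 gen{std::random_device{}()};
//   std::uniform_int_distribution<size_t> dist(0, up.size() - 1);
//   auto it = std::next(up.begin(), dist(gen));
//   return *it;
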
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}

StandaloneClient::~StandaloneClient()
{
  delete objecter;
  objecter = nullptr;
}

int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}

void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}