]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
import ceph 14.2.5
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/param.h>
24 #include <fcntl.h>
25 #include <sys/file.h>
26 #include <sys/utsname.h>
27 #include <sys/uio.h>
28
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
31
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
35 #else
36 #include <sys/xattr.h>
37 #endif
38
39 #if defined(__linux__)
40 #include <linux/falloc.h>
41 #endif
42
43 #include <sys/statvfs.h>
44
45 #include "common/config.h"
46 #include "common/version.h"
47
48 #include "mon/MonClient.h"
49
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
66
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
71
72 #include "common/Cond.h"
73 #include "common/Mutex.h"
74 #include "common/perf_counters.h"
75 #include "common/admin_socket.h"
76 #include "common/errno.h"
77 #include "include/str_list.h"
78
79 #define dout_subsys ceph_subsys_client
80
81 #include "include/lru.h"
82 #include "include/compat.h"
83 #include "include/stringify.h"
84
85 #include "Client.h"
86 #include "Inode.h"
87 #include "Dentry.h"
88 #include "Delegation.h"
89 #include "Dir.h"
90 #include "ClientSnapRealm.h"
91 #include "Fh.h"
92 #include "MetaSession.h"
93 #include "MetaRequest.h"
94 #include "ObjecterWriteback.h"
95 #include "posix_acl.h"
96
97 #include "include/ceph_assert.h"
98 #include "include/stat.h"
99
100 #include "include/cephfs/ceph_statx.h"
101
102 #if HAVE_GETGROUPLIST
103 #include <grp.h>
104 #include <pwd.h>
105 #include <unistd.h>
106 #endif
107
108 #undef dout_prefix
109 #define dout_prefix *_dout << "client." << whoami << " "
110
111 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
112
113 // FreeBSD fails to define this
114 #ifndef O_DSYNC
115 #define O_DSYNC 0x0
116 #endif
117 // Darwin fails to define this
118 #ifndef O_RSYNC
119 #define O_RSYNC 0x0
120 #endif
121
122 #ifndef O_DIRECT
123 #define O_DIRECT 0x0
124 #endif
125
126 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
127
128 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
129 {
130 Client *client = static_cast<Client*>(p);
131 client->flush_set_callback(oset);
132 }
133
134
135 // -------------
136
137 Client::CommandHook::CommandHook(Client *client) :
138 m_client(client)
139 {
140 }
141
142 bool Client::CommandHook::call(std::string_view command,
143 const cmdmap_t& cmdmap,
144 std::string_view format, bufferlist& out)
145 {
146 std::unique_ptr<Formatter> f(Formatter::create(format));
147 f->open_object_section("result");
148 m_client->client_lock.Lock();
149 if (command == "mds_requests")
150 m_client->dump_mds_requests(f.get());
151 else if (command == "mds_sessions")
152 m_client->dump_mds_sessions(f.get());
153 else if (command == "dump_cache")
154 m_client->dump_cache(f.get());
155 else if (command == "kick_stale_sessions")
156 m_client->_kick_stale_sessions();
157 else if (command == "status")
158 m_client->dump_status(f.get());
159 else
160 ceph_abort_msg("bad command registered");
161 m_client->client_lock.Unlock();
162 f->close_section();
163 f->flush(out);
164 return true;
165 }
166
167
168 // -------------
169
// Start a fresh directory read context for 'in'.  next_offset begins at 2
// because offsets 0 and 1 are reserved for the "." and ".." entries.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
175
// Reset the faked-ino allocator: mark every ino in [1024, 2^32-1] free and
// clear the allocation cursors.  Inos below 1024 are never handed out
// (1024..2047 is reserved for roots; see _assign_faked_root).
void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
  // Faking is forced on platforms with a 32-bit ino_t, since real ceph inos
  // may not fit; otherwise it is an opt-in config.
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
}
185
// Allocate the next free faked ino for 'in', scanning forward from the last
// one handed out and wrapping around when the free set is exhausted past the
// cursor.  The chosen ino is removed from free_faked_inos and recorded in
// faked_ino_map so it can be translated back to the real vino.
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // wrapped: restart the scan just above the reserved root range
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // cursor fell in a gap; jump to the start of the next free interval
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // cursor is inside this interval; take the next ino in it
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
207
208 /*
209 * In the faked mode, if you export multiple subdirectories,
210 * you will see that the inode numbers of the exported subdirectories
211 * are the same. so we distinguish the mount point by reserving
212 * the "fake ids" between "1024~2048" and combining the last
213 * 10bits(0x3ff) of the "root inodes".
214 */
215 void Client::_assign_faked_root(Inode *in)
216 {
217 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
218 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
219 last_used_faked_root = 0;
220 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
221 }
222 assert(it != free_faked_inos.end());
223 vinodeno_t inode_info = in->vino();
224 uint64_t inode_num = (uint64_t)inode_info.ino;
225 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
226 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
227 assert(it.get_start() + it.get_len() > last_used_faked_root);
228
229 in->faked_ino = last_used_faked_root;
230 free_faked_inos.erase(in->faked_ino);
231 faked_ino_map[in->faked_ino] = in->vino();
232 }
233
234 void Client::_release_faked_ino(Inode *in)
235 {
236 free_faked_inos.insert(in->faked_ino);
237 faked_ino_map.erase(in->faked_ino);
238 }
239
240 vinodeno_t Client::_map_faked_ino(ino_t ino)
241 {
242 vinodeno_t vino;
243 if (ino == 1)
244 vino = root->vino();
245 else if (faked_ino_map.count(ino))
246 vino = faked_ino_map[ino];
247 else
248 vino = vinodeno_t(0, CEPH_NOSNAP);
249 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
250 return vino;
251 }
252
253 vinodeno_t Client::map_faked_ino(ino_t ino)
254 {
255 std::lock_guard lock(client_lock);
256 return _map_faked_ino(ino);
257 }
258
259 // cons/des
260
// Construct a client bound to the given messenger/monitor-client/objecter.
// Sets up the faked-ino allocator, perf knobs, the fd allocator, and the
// object cacher + filer that back file I/O.  Nothing here contacts the
// cluster; that happens in init()/mount().
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    client_lock("Client::client_lock"),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds 0-9 are reserved, user fds start at 10
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					    &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				  client_flush_set_callback,    // all commit callback
				  (void*)this,
				  cct->_conf->client_oc_size,
				  cct->_conf->client_oc_max_objects,
				  cct->_conf->client_oc_max_dirty,
				  cct->_conf->client_oc_target_dirty,
				  cct->_conf->client_oc_max_dirty_age,
				  true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  // get notified when we are blacklisted so blacklisted state stays accurate
  objecter->enable_blacklist_events();
}
308
309
// Destructor: the caller must already have shut the client down (the lock
// must not be held on entry).
Client::~Client()
{
  ceph_assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
321
// Forcibly drop every cached object: open fds, open dirs, the dentry LRU,
// and finally the root inode and inode map.  Called with client_lock held,
// from the destructor and unmount paths.
void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  // at this point only the root (and any parents pinned by subdir mounts)
  // should remain in the inode map
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  ceph_assert(inode_map.empty());
}
361
362 inodeno_t Client::get_root_ino()
363 {
364 std::lock_guard l(client_lock);
365 if (use_faked_inos())
366 return root->faked_ino;
367 else
368 return root->ino;
369 }
370
371 Inode *Client::get_root()
372 {
373 std::lock_guard l(client_lock);
374 root->ll_get();
375 return root;
376 }
377
378
379 // debug crapola
380
381 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
382 {
383 filepath path;
384 in->make_long_path(path);
385 ldout(cct, 1) << "dump_inode: "
386 << (disconnected ? "DISCONNECTED ":"")
387 << "inode " << in->ino
388 << " " << path
389 << " ref " << in->get_num_ref()
390 << *in << dendl;
391
392 if (f) {
393 f->open_object_section("inode");
394 f->dump_stream("path") << path;
395 if (disconnected)
396 f->dump_int("disconnected", 1);
397 in->dump(f);
398 f->close_section();
399 }
400
401 did.insert(in);
402 if (in->dir) {
403 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
404 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
405 it != in->dir->dentries.end();
406 ++it) {
407 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
408 if (f) {
409 f->open_object_section("dentry");
410 it->second->dump(f);
411 f->close_section();
412 }
413 if (it->second->inode)
414 dump_inode(f, it->second->inode.get(), did, false);
415 }
416 }
417 }
418
419 void Client::dump_cache(Formatter *f)
420 {
421 set<Inode*> did;
422
423 ldout(cct, 1) << __func__ << dendl;
424
425 if (f)
426 f->open_array_section("cache");
427
428 if (root)
429 dump_inode(f, root, did, true);
430
431 // make a second pass to catch anything disconnected
432 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
433 it != inode_map.end();
434 ++it) {
435 if (did.count(it->second))
436 continue;
437 dump_inode(f, it->second, did, true);
438 }
439
440 if (f)
441 f->close_section();
442 }
443
// Dump client identity, cache counters, and epoch/blacklist state for the
// "status" admin-socket command.  Caller must hold client_lock.
void Client::dump_status(Formatter *f)
{
  ceph_assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    // the client metadata key/value pairs sent to the MDS
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}
474
475 int Client::init()
476 {
477 timer.init();
478 objectcacher->start();
479
480 client_lock.Lock();
481 ceph_assert(!initialized);
482
483 messenger->add_dispatcher_tail(this);
484 client_lock.Unlock();
485
486 _finish_init();
487 return 0;
488 }
489
490 void Client::_finish_init()
491 {
492 client_lock.Lock();
493 // logger
494 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
495 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
496 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
497 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
498 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
499 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
500 logger.reset(plb.create_perf_counters());
501 cct->get_perfcounters_collection()->add(logger.get());
502
503 client_lock.Unlock();
504
505 cct->_conf.add_observer(this);
506
507 AdminSocket* admin_socket = cct->get_admin_socket();
508 int ret = admin_socket->register_command("mds_requests",
509 "mds_requests",
510 &m_command_hook,
511 "show in-progress mds requests");
512 if (ret < 0) {
513 lderr(cct) << "error registering admin socket command: "
514 << cpp_strerror(-ret) << dendl;
515 }
516 ret = admin_socket->register_command("mds_sessions",
517 "mds_sessions",
518 &m_command_hook,
519 "show mds session state");
520 if (ret < 0) {
521 lderr(cct) << "error registering admin socket command: "
522 << cpp_strerror(-ret) << dendl;
523 }
524 ret = admin_socket->register_command("dump_cache",
525 "dump_cache",
526 &m_command_hook,
527 "show in-memory metadata cache contents");
528 if (ret < 0) {
529 lderr(cct) << "error registering admin socket command: "
530 << cpp_strerror(-ret) << dendl;
531 }
532 ret = admin_socket->register_command("kick_stale_sessions",
533 "kick_stale_sessions",
534 &m_command_hook,
535 "kick sessions that were remote reset");
536 if (ret < 0) {
537 lderr(cct) << "error registering admin socket command: "
538 << cpp_strerror(-ret) << dendl;
539 }
540 ret = admin_socket->register_command("status",
541 "status",
542 &m_command_hook,
543 "show overall client status");
544 if (ret < 0) {
545 lderr(cct) << "error registering admin socket command: "
546 << cpp_strerror(-ret) << dendl;
547 }
548
549 client_lock.Lock();
550 initialized = true;
551 client_lock.Unlock();
552 }
553
// Tear the client down in dependency order: sessions, observers/admin
// socket, the four callback finishers, the object cacher (joined outside
// the lock), the timer, the objecter finisher, and finally perf counters.
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // Each finisher is only running if its callback was ever registered;
  // drain before stopping so queued invalidations complete.
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  ceph_assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
608
609
610 // ===================
611 // metadata cache stuff
612
// Expire dentries from the LRU down to client_cache_size (or to zero while
// unmounting).  Optionally asks the kernel to drop its dcache too, and frees
// the root inode once nothing else references it.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // loop until the LRU size stops shrinking (trim_dentry may not always
  // remove exactly one entry)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max)  break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  // still over budget: pinned dentries are kept alive by the kernel dcache,
  // so ask it to let go
  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
646
647 void Client::trim_cache_for_reconnect(MetaSession *s)
648 {
649 mds_rank_t mds = s->mds_num;
650 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
651
652 int trimmed = 0;
653 list<Dentry*> skipped;
654 while (lru.lru_get_size() > 0) {
655 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
656 if (!dn)
657 break;
658
659 if ((dn->inode && dn->inode->caps.count(mds)) ||
660 dn->dir->parent_inode->caps.count(mds)) {
661 trim_dentry(dn);
662 trimmed++;
663 } else
664 skipped.push_back(dn);
665 }
666
667 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
668 lru.lru_insert_mid(*p);
669
670 ldout(cct, 20) << __func__ << " mds." << mds
671 << " trimmed " << trimmed << " dentries" << dendl;
672
673 if (s->caps.size() > 0)
674 _invalidate_kernel_dcache();
675 }
676
677 void Client::trim_dentry(Dentry *dn)
678 {
679 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
680 << " in dir "
681 << std::hex << dn->dir->parent_inode->ino << std::dec
682 << dendl;
683 if (dn->inode) {
684 Inode *diri = dn->dir->parent_inode;
685 diri->dir_release_count++;
686 clear_dir_complete_and_ordered(diri, true);
687 }
688 unlink(dn, false, false); // drop dir, drop dentry
689 }
690
691
// Apply size/truncation state from the MDS to a cached inode.  Updates are
// ordered by truncate_seq: a newer seq always wins, an equal seq only ever
// grows the size (concurrent writers may extend past what we reported).
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // truncate_size can change independently of size (e.g. a truncate racing
  // with buffered writes), so track it under the same seq ordering
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
733
734 void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
735 utime_t ctime, utime_t mtime, utime_t atime)
736 {
737 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
738 << " ctime " << ctime << " mtime " << mtime << dendl;
739
740 if (time_warp_seq > in->time_warp_seq)
741 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
742 << " is higher than local time_warp_seq "
743 << in->time_warp_seq << dendl;
744
745 int warn = false;
746 // be careful with size, mtime, atime
747 if (issued & (CEPH_CAP_FILE_EXCL|
748 CEPH_CAP_FILE_WR|
749 CEPH_CAP_FILE_BUFFER|
750 CEPH_CAP_AUTH_EXCL|
751 CEPH_CAP_XATTR_EXCL)) {
752 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
753 if (ctime > in->ctime)
754 in->ctime = ctime;
755 if (time_warp_seq > in->time_warp_seq) {
756 //the mds updated times, so take those!
757 in->mtime = mtime;
758 in->atime = atime;
759 in->time_warp_seq = time_warp_seq;
760 } else if (time_warp_seq == in->time_warp_seq) {
761 //take max times
762 if (mtime > in->mtime)
763 in->mtime = mtime;
764 if (atime > in->atime)
765 in->atime = atime;
766 } else if (issued & CEPH_CAP_FILE_EXCL) {
767 //ignore mds values as we have a higher seq
768 } else warn = true;
769 } else {
770 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
771 if (time_warp_seq >= in->time_warp_seq) {
772 in->ctime = ctime;
773 in->mtime = mtime;
774 in->atime = atime;
775 in->time_warp_seq = time_warp_seq;
776 } else warn = true;
777 }
778 if (warn) {
779 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
780 << time_warp_seq << " is lower than local time_warp_seq "
781 << in->time_warp_seq
782 << dendl;
783 }
784 }
785
786 void Client::_fragmap_remove_non_leaves(Inode *in)
787 {
788 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
789 if (!in->dirfragtree.is_leaf(p->first))
790 in->fragmap.erase(p++);
791 else
792 ++p;
793 }
794
795 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
796 {
797 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
798 if (p->second == mds)
799 in->fragmap.erase(p++);
800 else
801 ++p;
802 }
803
// Insert or refresh a cached inode from an MDS InodeStat.  Fields are only
// taken from the MDS when either the stat is strictly newer (versioned, from
// the auth MDS) or the corresponding SHARED cap was newly issued and we do
// not hold the EXCL cap that would make our local copy authoritative.
// Returns the cached inode (never null).
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode we ever see becomes the root
      root = in;
      if (use_faked_inos())
	_assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // pre-mount path traversal: chain ancestors above the mount root
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  // caps in the stat we don't already hold
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning indoes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inodes have no live caps; just accumulate what was granted
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
959
960
961 /*
962 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
963 */
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * If a dentry with that name already exists and points at the right inode it
 * is just touched; if it points at the wrong inode it is unlinked first.
 * 'old_dentry' (from a rename) is unlinked, invalidating its old directory's
 * ordering markers when it lived elsewhere.  The dentry lease from the reply
 * is always applied.  Returns the (possibly new) dentry.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref so 'in' cannot be freed while we relink it
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// cross-directory rename: the source dir's ordering is now stale
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1009
1010 void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1011 {
1012 utime_t dttl = from;
1013 dttl += (float)dlease->duration_ms / 1000.0;
1014
1015 ceph_assert(dn);
1016
1017 if (dlease->mask & CEPH_LOCK_DN) {
1018 if (dttl > dn->lease_ttl) {
1019 ldout(cct, 10) << "got dentry lease on " << dn->name
1020 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1021 dn->lease_ttl = dttl;
1022 dn->lease_mds = session->mds_num;
1023 dn->lease_seq = dlease->seq;
1024 dn->lease_gen = session->cap_gen;
1025 }
1026 }
1027 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1028 }
1029
1030
1031 /*
1032 * update MDS location cache for a single inode
1033 */
1034 void Client::update_dir_dist(Inode *in, DirStat *dst)
1035 {
1036 // auth
1037 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1038 if (dst->auth >= 0) {
1039 in->fragmap[dst->frag] = dst->auth;
1040 } else {
1041 in->fragmap.erase(dst->frag);
1042 }
1043 if (!in->dirfragtree.is_leaf(dst->frag)) {
1044 in->dirfragtree.force_to_leaf(cct, dst->frag);
1045 _fragmap_remove_non_leaves(in);
1046 }
1047
1048 // replicated
1049 in->dir_replicated = !dst->dist.empty(); // FIXME that's just one frag!
1050
1051 // dist
1052 /*
1053 if (!st->dirfrag_dist.empty()) { // FIXME
1054 set<int> dist = st->dirfrag_dist.begin()->second;
1055 if (dist.empty() && !in->dir_contacts.empty())
1056 ldout(cct, 9) << "lost dist spec for " << in->ino
1057 << " " << dist << dendl;
1058 if (!dist.empty() && in->dir_contacts.empty())
1059 ldout(cct, 9) << "got dist spec for " << in->ino
1060 << " " << dist << dendl;
1061 in->dir_contacts = dist;
1062 }
1063 */
1064 }
1065
1066 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1067 {
1068 if (diri->flags & I_COMPLETE) {
1069 if (complete) {
1070 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1071 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1072 } else {
1073 if (diri->flags & I_DIR_ORDERED) {
1074 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1075 diri->flags &= ~I_DIR_ORDERED;
1076 }
1077 }
1078 if (diri->dir)
1079 diri->dir->readdir_cache.clear();
1080 }
1081 }
1082
1083 /*
1084 * insert results from readdir or lssnap into the metadata cache.
1085 */
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the reply's extra bufferlist (DirStat + dentry/lease/inode
 * triples), links the returned dentries into the cache, assigns readdir
 * offsets, and — when still consistent with the directory's release /
 * ordered generation counters — populates the shared readdir cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // With REPLY_ENCODING the MDS uses versioned encoding; otherwise fall
  // back to the feature bits negotiated on the connection.
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?  lssnap results are inserted under the .snap pseudo-dir.
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat, then the entry count and result flags
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real entry ("." and ".." occupy 0 and 1)
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    // In hash-order mode, offsets restart at 2 whenever the dentry-name
    // hash changes; recover the hash we resumed from.
    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // The MDS may have answered for a different (e.g. merged/split) frag
    // than the one we asked for.
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // Starting a fresh scan from the very beginning: snapshot the
    // directory's generation counters so we can later decide whether the
    // readdir cache we build is trustworthy.
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      // per-entry wire format: name, lease, inode stat
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	// offsets are (hash, index) pairs; index resets when hash changes
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache, but only while the directory's generation
      // counters still match the snapshot we took at scan start
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    // appending a new batch: grow the cache up front
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  // overwriting an existing slot; if the dir is complete+ordered
	  // the cached entry must already agree
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // remember where to resume the next readdir request
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1244
1245 /** insert_trace
1246 *
1247 * insert a trace from a MDS reply into the cache.
1248 */
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * Decodes the (dirstat, dentry, target inode) trace from the reply and
 * merges it into the metadata cache.  Handles traceless replies (e.g.
 * after MDS replay) by invalidating/unlinking affected dentries, and
 * dispatches readdir/lssnap extra payloads to insert_readdir_results().
 *
 * @return the target inode from the trace, or NULL if there was none
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already carried (and we consumed) the trace
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // Traceless reply: we cannot trust our cached state for the
    // affected dentry/directory, so invalidate it.
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  // Pick the decoding feature set: versioned encoding if the MDS
  // supports it, otherwise the connection's negotiated features.
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // trace wire order: parent dir stat, dir frag stat, dentry name, lease
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    // sanity check: if we asked for xattrs, the reply must contain them
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // a dentry with no target inode: a negative lookup result.
      // drop any stale positive dentry and (if leased) cache the
      // negative dentry.
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];
    
    string dname = request->path.last_dentry();
    
    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1420
1421 // -------
1422
/**
 * Pick the MDS rank to send a request to.
 *
 * Preference order: an explicitly requested resend_mds; the MDS owning
 * the dirfrag that the (hashed) dentry falls into; the MDS holding our
 * auth cap (or any cap) on the relevant inode; finally a random up MDS.
 *
 * @param req        the request being routed
 * @param phash_diri [out, optional] set to the directory inode whose
 *                   fragmap was used, when routing by dirfrag hash
 * @return the chosen rank (never MDS_RANK_NONE on a non-empty mdsmap path)
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  // an earlier forward/retry pinned a specific target
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  // derive an inode (and, for path/dentry ops, a dentry-name hash)
  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes are not routable; walk up to a non-snap ancestor
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      // the hash was computed against the snapped dir; no longer valid
      is_hash = false;
    }
  
    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
             << " hash=" << hash << dendl;
  
    // route by which MDS owns the dirfrag the hashed name falls into
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }
  
    // otherwise route by the caps we hold on the inode
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;
  
    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1525
1526
1527 void Client::connect_mds_targets(mds_rank_t mds)
1528 {
1529 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1530 ceph_assert(mds_sessions.count(mds));
1531 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1532 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1533 q != info.export_targets.end();
1534 ++q) {
1535 if (mds_sessions.count(*q) == 0 &&
1536 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1537 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1538 << " export target mds." << *q << dendl;
1539 _open_mds_session(*q);
1540 }
1541 }
1542 }
1543
1544 void Client::dump_mds_sessions(Formatter *f)
1545 {
1546 f->dump_int("id", get_nodeid().v);
1547 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1548 f->dump_object("inst", inst);
1549 f->dump_stream("inst_str") << inst;
1550 f->dump_stream("addr_str") << inst.addr;
1551 f->open_array_section("sessions");
1552 for (const auto &p : mds_sessions) {
1553 f->open_object_section("session");
1554 p.second.dump(f);
1555 f->close_section();
1556 }
1557 f->close_section();
1558 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1559 }
1560 void Client::dump_mds_requests(Formatter *f)
1561 {
1562 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1563 p != mds_requests.end();
1564 ++p) {
1565 f->open_object_section("request");
1566 p->second->dump(f);
1567 f->close_section();
1568 }
1569 }
1570
/**
 * Resolve the target inode of a completed request, compensating for
 * traceless replies.
 *
 * If the reply carried a trace, request->target is used directly.
 * Otherwise we re-discover the target: by created-ino lookup in the
 * inode map, by a fresh lookup of the dentry, or by a forced getattr on
 * the request inode.
 *
 * @param r        result of the request so far
 * @param request  the completed request
 * @param reply    the MDS reply (extra_bl may carry the created ino)
 * @param ptarget  [out] the resolved target inode
 * @param pcreated [out, optional] whether this request created the inode
 * @param perms    credentials for the follow-up lookup/getattr
 * @return r, or a new error (-EINTR if the created ino no longer matches)
 */
int Client::verify_reply_trace(int r,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // normal case: the trace identified the target for us
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      // traceless, but the created inode is already cached
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen?  i want logs!");
	}
      } else {
	// no dentry either: re-stat the request inode directly
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1643
1644
1645 /**
1646 * make a request
1647 *
1648 * Blocking helper to make an MDS request.
1649 *
1650 * If the ptarget flag is set, behavior changes slightly: the caller
1651 * expects to get a pointer to the inode we are creating or operating
1652 * on. As a result, we will follow up any traceless mutation reply
1653 * with a getattr or lookup to transparently handle a traceless reply
1654 * from the MDS (as when the MDS restarts and the client has to replay
1655 * a request).
1656 *
1657 * @param request the MetaRequest to execute
1658 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1659 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1660 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1661 * @param use_mds [optional] prefer a specific mds (-1 for default)
1662 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1663 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  // (SETFILELOCK is excluded from oldest_tid tracking: it can block
  // indefinitely and would pin the MDS's completed-requests list)
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // Main send/retry loop: pick an MDS, ensure an open session, send,
  // then wait to be woken by a reply, a forward, or a kick.
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the chosen rank no longer exists (cluster shrank); forget the
	// stale routing info and retry
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	// cluster degraded, waiting for recovery
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
	   request->resend_mds < 0 && // forward
	   !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply) 
      break;
  }

  if (!request->reply) {
    // we only get here via abort (break paths above)
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  // the dispatch thread is parked until we signal that we've consumed
  // the reply
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1808
1809 void Client::unregister_request(MetaRequest *req)
1810 {
1811 mds_requests.erase(req->tid);
1812 if (req->tid == oldest_tid) {
1813 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1814 while (true) {
1815 if (p == mds_requests.end()) {
1816 oldest_tid = 0;
1817 break;
1818 }
1819 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1820 oldest_tid = p->first;
1821 break;
1822 }
1823 ++p;
1824 }
1825 }
1826 put_request(req);
1827 }
1828
1829 void Client::put_request(MetaRequest *request)
1830 {
1831 if (request->_put()) {
1832 int op = -1;
1833 if (request->success)
1834 op = request->get_op();
1835 InodeRef other_in;
1836 request->take_other_inode(&other_in);
1837 delete request;
1838
1839 if (other_in &&
1840 (op == CEPH_MDS_OP_RMDIR ||
1841 op == CEPH_MDS_OP_RENAME ||
1842 op == CEPH_MDS_OP_RMSNAP)) {
1843 _try_to_trim_inode(other_in.get(), false);
1844 }
1845 }
1846 }
1847
1848 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1849 mds_rank_t mds, int drop,
1850 int unless, int force)
1851 {
1852 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
1853 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1854 << ", have:" << ", force:" << force << ")" << dendl;
1855 int released = 0;
1856 auto it = in->caps.find(mds);
1857 if (it != in->caps.end()) {
1858 Cap &cap = it->second;
1859 drop &= ~(in->dirty_caps | get_caps_used(in));
1860 if ((drop & cap.issued) &&
1861 !(unless & cap.issued)) {
1862 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(cap.issued) << dendl;
1863 cap.issued &= ~drop;
1864 cap.implemented &= ~drop;
1865 released = 1;
1866 ldout(cct, 25) << "Now have: " << ccap_string(cap.issued) << dendl;
1867 } else {
1868 released = force;
1869 }
1870 if (released) {
1871 ceph_mds_request_release rel;
1872 rel.ino = in->ino;
1873 rel.cap_id = cap.cap_id;
1874 rel.seq = cap.seq;
1875 rel.issue_seq = cap.issue_seq;
1876 rel.mseq = cap.mseq;
1877 rel.caps = cap.implemented;
1878 rel.wanted = cap.wanted;
1879 rel.dname_len = 0;
1880 rel.dname_seq = 0;
1881 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1882 }
1883 }
1884 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
1885 << released << dendl;
1886 return released;
1887 }
1888
1889 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1890 mds_rank_t mds, int drop, int unless)
1891 {
1892 ldout(cct, 20) << __func__ << " enter(dn:"
1893 << dn << ")" << dendl;
1894 int released = 0;
1895 if (dn->dir)
1896 released = encode_inode_release(dn->dir->parent_inode, req,
1897 mds, drop, unless, 1);
1898 if (released && dn->lease_mds == mds) {
1899 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1900 auto& rel = req->cap_releases.back();
1901 rel.item.dname_len = dn->name.length();
1902 rel.item.dname_seq = dn->lease_seq;
1903 rel.dname = dn->name;
1904 }
1905 ldout(cct, 25) << __func__ << " exit(dn:"
1906 << dn << ")" << dendl;
1907 }
1908
1909
1910 /*
1911 * This requires the MClientRequest *request member to be set.
1912 * It will error out horribly without one.
1913 * Additionally, if you set any *drop member, you'd better have
1914 * set the corresponding dentry!
1915 */
1916 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1917 {
1918 ldout(cct, 20) << __func__ << " enter (req: "
1919 << req << ", mds: " << mds << ")" << dendl;
1920 if (req->inode_drop && req->inode())
1921 encode_inode_release(req->inode(), req,
1922 mds, req->inode_drop,
1923 req->inode_unless);
1924
1925 if (req->old_inode_drop && req->old_inode())
1926 encode_inode_release(req->old_inode(), req,
1927 mds, req->old_inode_drop,
1928 req->old_inode_unless);
1929 if (req->other_inode_drop && req->other_inode())
1930 encode_inode_release(req->other_inode(), req,
1931 mds, req->other_inode_drop,
1932 req->other_inode_unless);
1933
1934 if (req->dentry_drop && req->dentry())
1935 encode_dentry_release(req->dentry(), req,
1936 mds, req->dentry_drop,
1937 req->dentry_unless);
1938
1939 if (req->old_dentry_drop && req->old_dentry())
1940 encode_dentry_release(req->old_dentry(), req,
1941 mds, req->old_dentry_drop,
1942 req->old_dentry_unless);
1943 ldout(cct, 25) << __func__ << " exit (req: "
1944 << req << ", mds " << mds <<dendl;
1945 }
1946
1947 bool Client::have_open_session(mds_rank_t mds)
1948 {
1949 const auto &it = mds_sessions.find(mds);
1950 return it != mds_sessions.end() &&
1951 (it->second.state == MetaSession::STATE_OPEN ||
1952 it->second.state == MetaSession::STATE_STALE);
1953 }
1954
1955 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1956 {
1957 const auto &it = mds_sessions.find(mds);
1958 if (it == mds_sessions.end() || it->second.con != con) {
1959 return NULL;
1960 } else {
1961 return &it->second;
1962 }
1963 }
1964
1965 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1966 {
1967 auto it = mds_sessions.find(mds);
1968 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
1969 }
1970
1971 /**
1972 * Populate a map of strings with client-identifying metadata,
1973 * such as the hostname. Call this once at initialization.
1974 */
1975 void Client::populate_metadata(const std::string &mount_root)
1976 {
1977 // Hostname
1978 struct utsname u;
1979 int r = uname(&u);
1980 if (r >= 0) {
1981 metadata["hostname"] = u.nodename;
1982 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1983 } else {
1984 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1985 }
1986
1987 metadata["pid"] = stringify(getpid());
1988
1989 // Ceph entity id (the '0' in "client.0")
1990 metadata["entity_id"] = cct->_conf->name.get_id();
1991
1992 // Our mount position
1993 if (!mount_root.empty()) {
1994 metadata["root"] = mount_root;
1995 }
1996
1997 // Ceph version
1998 metadata["ceph_version"] = pretty_version_to_str();
1999 metadata["ceph_sha1"] = git_version_to_str();
2000
2001 // Apply any metadata from the user's configured overrides
2002 std::vector<std::string> tokens;
2003 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2004 for (const auto &i : tokens) {
2005 auto eqpos = i.find("=");
2006 // Throw out anything that isn't of the form "<str>=<str>"
2007 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2008 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2009 continue;
2010 }
2011 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2012 }
2013 }
2014
2015 /**
2016 * Optionally add or override client metadata fields.
2017 */
2018 void Client::update_metadata(std::string const &k, std::string const &v)
2019 {
2020 std::lock_guard l(client_lock);
2021 ceph_assert(initialized);
2022
2023 auto it = metadata.find(k);
2024 if (it != metadata.end()) {
2025 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2026 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2027 }
2028
2029 metadata[k] = v;
2030 }
2031
/**
 * Create a session object for @mds and send a session-open request,
 * unless this exact MDS instance previously rejected us.
 *
 * @return the (possibly not-yet-open) session; never NULL
 */
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  // construct the MetaSession in place in the map
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->addrs) {
      ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
		       "because we were rejected" << dendl;
      return session;
    } else {
      // same rank but a new daemon instance: the old rejection no
      // longer applies
      ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
		       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  auto m = MClientSession::create(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
2062
2063 void Client::_close_mds_session(MetaSession *s)
2064 {
2065 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2066 s->state = MetaSession::STATE_CLOSING;
2067 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2068 }
2069
/**
 * Tear down a session that is now closed: drop the connection, wake
 * waiters, release its caps, kick its pending requests for resend, and
 * erase it from mds_sessions.  The order matters: waiters and requests
 * are dealt with before the session object is destroyed.
 */
void Client::_closed_mds_session(MetaSession *s)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);  // invalidates s
}
2081
// Dispatch a session-control message from an MDS (open / close / renewcaps /
// stale / recall / flushmsg / force-ro / reject) to the matching MetaSession.
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // Refuse the session if the MDS lacks any feature we require, and
      // remember its address so we don't retry the same instance.
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
	lderr(cct) << "mds." << from << " lacks required features '"
		   << missing_features << "', closing session " << dendl;
	rejected_by_mds[session->mds_num] = session->addrs;
	_close_mds_session(session);
	_closed_mds_session(session);
	break;
      }
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (unmounting)
	mount_cond.Signal();        // unmount is waiting on session activity
      else
	connect_mds_targets(from);  // pre-connect to this MDS's offload targets
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // Only honor the ack matching the renew request we most recently sent.
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
	wake_up_session_caps(session, false);  // caps became valid again
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases: bump the cap generation and force the
    // ttl into the past, then try to renew immediately
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    // MDS is under cache pressure; trim down to its requested cap count
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    // NB: the inner 'm' deliberately shadows the message; it is the pending
    // cap-release batch for this session, if any.
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
	error_str = it->second;
      else
	error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      // remember the rejecting address so _open_mds_session won't retry it
      rejected_by_mds[session->mds_num] = session->addrs;
      _closed_mds_session(session);
    }
    break;

  default:
    ceph_abort();
  }
}
2175
2176 bool Client::_any_stale_sessions() const
2177 {
2178 ceph_assert(client_lock.is_locked_by_me());
2179
2180 for (const auto &p : mds_sessions) {
2181 if (p.second.state == MetaSession::STATE_STALE) {
2182 return true;
2183 }
2184 }
2185
2186 return false;
2187 }
2188
2189 void Client::_kick_stale_sessions()
2190 {
2191 ldout(cct, 1) << __func__ << dendl;
2192
2193 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2194 MetaSession &s = it->second;
2195 ++it;
2196 if (s.state == MetaSession::STATE_STALE)
2197 _closed_mds_session(&s);
2198 }
2199 }
2200
// Build a fresh MClientRequest for this (possibly retried) MetaRequest and
// send it over the given session.  drop_cap_releases is used during
// reconnect, before cap state has been re-established with the MDS.
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // replaying an op the MDS applied (unsafe ack) but never committed
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't sent cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may reference pool names; pin the osdmap epoch we resolved them against
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // first transmission of this request — start the latency clock
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap migration seq in force when this request was sent
  // (used by the ESTALE retry logic in handle_client_reply)
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2249
2250 MClientRequest::ref Client::build_client_request(MetaRequest *request)
2251 {
2252 auto req = MClientRequest::create(request->get_op());
2253 req->set_tid(request->tid);
2254 req->set_stamp(request->op_stamp);
2255 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2256
2257 // if the filepath's haven't been set, set them!
2258 if (request->path.empty()) {
2259 Inode *in = request->inode();
2260 Dentry *de = request->dentry();
2261 if (in)
2262 in->make_nosnap_relative_path(request->path);
2263 else if (de) {
2264 if (de->inode)
2265 de->inode->make_nosnap_relative_path(request->path);
2266 else if (de->dir) {
2267 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2268 request->path.push_dentry(de->name);
2269 }
2270 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2271 << " No path, inode, or appropriately-endowed dentry given!"
2272 << dendl;
2273 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2274 << " No path, inode, or dentry given!"
2275 << dendl;
2276 }
2277 req->set_filepath(request->get_filepath());
2278 req->set_filepath2(request->get_filepath2());
2279 req->set_data(request->data);
2280 req->set_retry_attempt(request->retry_attempt++);
2281 req->head.num_fwd = request->num_fwd;
2282 const gid_t *_gids;
2283 int gid_count = request->perms.get_gids(&_gids);
2284 req->set_gid_list(gid_count, _gids);
2285 return req;
2286 }
2287
2288
2289
2290 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2291 {
2292 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2293 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2294 if (!session) {
2295 return;
2296 }
2297 ceph_tid_t tid = fwd->get_tid();
2298
2299 if (mds_requests.count(tid) == 0) {
2300 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2301 return;
2302 }
2303
2304 MetaRequest *request = mds_requests[tid];
2305 ceph_assert(request);
2306
2307 // reset retry counter
2308 request->retry_attempt = 0;
2309
2310 // request not forwarded, or dest mds has no session.
2311 // resend.
2312 ldout(cct, 10) << __func__ << " tid " << tid
2313 << " fwd " << fwd->get_num_fwd()
2314 << " to mds." << fwd->get_dest_mds()
2315 << ", resending to " << fwd->get_dest_mds()
2316 << dendl;
2317
2318 request->mds = -1;
2319 request->item.remove_myself();
2320 request->num_fwd = fwd->get_num_fwd();
2321 request->resend_mds = fwd->get_dest_mds();
2322 request->caller_cond->Signal();
2323 }
2324
2325 bool Client::is_dir_operation(MetaRequest *req)
2326 {
2327 int op = req->get_op();
2328 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2329 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2330 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2331 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2332 return true;
2333 return false;
2334 }
2335
2336 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2337 {
2338 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2339 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2340 if (!session) {
2341 return;
2342 }
2343
2344 ceph_tid_t tid = reply->get_tid();
2345 bool is_safe = reply->is_safe();
2346
2347 if (mds_requests.count(tid) == 0) {
2348 lderr(cct) << __func__ << " no pending request on tid " << tid
2349 << " safe is:" << is_safe << dendl;
2350 return;
2351 }
2352 MetaRequest *request = mds_requests.at(tid);
2353
2354 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2355 << " tid " << tid << dendl;
2356
2357 if (request->got_unsafe && !is_safe) {
2358 //duplicate response
2359 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2360 << mds_num << " safe:" << is_safe << dendl;
2361 return;
2362 }
2363
2364 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2365 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2366 << " from mds." << request->mds << dendl;
2367 request->send_to_auth = true;
2368 request->resend_mds = choose_target_mds(request);
2369 Inode *in = request->inode();
2370 std::map<mds_rank_t, Cap>::const_iterator it;
2371 if (request->resend_mds >= 0 &&
2372 request->resend_mds == request->mds &&
2373 (in == NULL ||
2374 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2375 request->sent_on_mseq == it->second.mseq)) {
2376 ldout(cct, 20) << "have to return ESTALE" << dendl;
2377 } else {
2378 request->caller_cond->Signal();
2379 return;
2380 }
2381 }
2382
2383 ceph_assert(!request->reply);
2384 request->reply = reply;
2385 insert_trace(request, session);
2386
2387 // Handle unsafe reply
2388 if (!is_safe) {
2389 request->got_unsafe = true;
2390 session->unsafe_requests.push_back(&request->unsafe_item);
2391 if (is_dir_operation(request)) {
2392 Inode *dir = request->inode();
2393 ceph_assert(dir);
2394 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2395 }
2396 if (request->target) {
2397 InodeRef &in = request->target;
2398 in->unsafe_ops.push_back(&request->unsafe_target_item);
2399 }
2400 }
2401
2402 // Only signal the caller once (on the first reply):
2403 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2404 if (!is_safe || !request->got_unsafe) {
2405 Cond cond;
2406 request->dispatch_cond = &cond;
2407
2408 // wake up waiter
2409 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2410 request->caller_cond->Signal();
2411
2412 // wake for kick back
2413 while (request->dispatch_cond) {
2414 ldout(cct, 20) << __func__ << " awaiting kickback on tid " << tid << " " << &cond << dendl;
2415 cond.Wait(client_lock);
2416 }
2417 }
2418
2419 if (is_safe) {
2420 // the filesystem change is committed to disk
2421 // we're done, clean up
2422 if (request->got_unsafe) {
2423 request->unsafe_item.remove_myself();
2424 request->unsafe_dir_item.remove_myself();
2425 request->unsafe_target_item.remove_myself();
2426 signal_cond_list(request->waitfor_safe);
2427 }
2428 request->item.remove_myself();
2429 unregister_request(request);
2430 }
2431 if (unmounting)
2432 mount_cond.Signal();
2433 }
2434
2435 void Client::_handle_full_flag(int64_t pool)
2436 {
2437 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2438 << "on " << pool << dendl;
2439 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2440 // to do this rather than blocking, because otherwise when we fill up we
2441 // potentially lock caps forever on files with dirty pages, and we need
2442 // to be able to release those caps to the MDS so that it can delete files
2443 // and free up space.
2444 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2445
2446 // For all inodes with layouts in this pool and a pending flush write op
2447 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2448 // from ObjectCacher so that it doesn't re-issue the write in response to
2449 // the ENOSPC error.
2450 // Fortunately since we're cancelling everything in a given pool, we don't
2451 // need to know which ops belong to which ObjectSet, we can just blow all
2452 // the un-flushed cached data away and mark any dirty inodes' async_err
2453 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2454 // affecting this pool, and all the objectsets we're purging were also
2455 // in this pool.
2456 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2457 i != inode_map.end(); ++i)
2458 {
2459 Inode *inode = i->second;
2460 if (inode->oset.dirty_or_tx
2461 && (pool == -1 || inode->layout.pool_id == pool)) {
2462 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2463 << " has dirty objects, purging and setting ENOSPC" << dendl;
2464 objectcacher->purge_set(&inode->oset);
2465 inode->set_async_err(-ENOSPC);
2466 }
2467 }
2468
2469 if (cancelled_epoch != (epoch_t)-1) {
2470 set_cap_epoch_barrier(cancelled_epoch);
2471 }
2472 }
2473
// React to a new OSDMap: detect whether this client has been blacklisted
// (or un-blacklisted), and cancel/purge writes for pools — or the whole
// cluster — that have gone full.
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blacklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
    });
  if (!blacklisted) {
    // check each of our addresses against the newly observed blacklist entries
    for (auto a : myaddrs.v) {
      // blacklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blacklists.count(a)) {
	new_blacklist = true;
	break;
      }
      if (prenautilus) {
	// ...except pre-nautilus, they were TYPE_LEGACY
	a.set_type(entity_addr_t::TYPE_LEGACY);
	if (new_blacklists.count(a)) {
	  new_blacklist = true;
	  break;
	}
      }
    }
  }
  if (new_blacklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
	});
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;

    // fail all MDS sessions — the MDS will also see the blacklist entry
    _abort_mds_sessions(-EBLACKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
	return o.is_blacklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // cluster-wide full flag: cancel writes in every pool
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2557
2558
2559 // ------------------------
2560 // incoming messages
2561
2562
// Top-level message dispatcher: routes each incoming message type to its
// handler under client_lock.  Returns false for message types this client
// does not handle, so another dispatcher may claim them.
bool Client::ms_dispatch2(const MessageRef &m)
{
  std::lock_guard l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(MMDSMap::msgref_cast(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(MFSMap::msgref_cast(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(MFSMapUser::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(MClientSession::msgref_cast(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(MOSDMap::msgref_cast(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(MClientRequestForward::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(MClientReply::msgref_cast(m));
    break;

    // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(MClientSnap::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(MClientCaps::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(MClientLease::msgref_cast(m));
    break;
  case MSG_COMMAND_REPLY:
    // only command replies from an MDS are ours
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(MCommandReply::msgref_cast(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(MClientQuota::msgref_cast(m));
    break;

  default:
    return false;
  }

  // unmounting?
  if (unmounting) {
    // During unmount, every dispatched message is an opportunity to shrink
    // the cache; if the size changed, poke unmount() to re-check completion.
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
		   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    // NOTE(review): this condition signals when the cache *grew*, while the
    // log message says "shrank" — looks inverted ('>' expected); a spurious
    // Signal is harmless since unmount() re-checks, but verify upstream.
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2644
2645 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2646 {
2647 fsmap.reset(new FSMap(m->get_fsmap()));
2648
2649 signal_cond_list(waiting_for_fsmap);
2650
2651 monclient->sub_got("fsmap", fsmap->get_epoch());
2652 }
2653
2654 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2655 {
2656 fsmap_user.reset(new FSMapUser);
2657 *fsmap_user = m->get_fsmap();
2658
2659 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2660 signal_cond_list(waiting_for_fsmap);
2661 }
2662
// Install a newer MDSMap and reconcile every open session with it: cancel
// commands to vanished/laggy MDS GIDs, drop sessions to dead or replaced
// ranks, send reconnects where requested, and kick requests/caps when an
// MDS becomes active.
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  mds_gid_t old_inc, new_inc;
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    // stale or duplicate map; ignore
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;

  // keep the previous map so per-rank state transitions can be compared
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // erase after the scan above so the commands map isn't mutated mid-iteration
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = &p->second;
    ++p;  // advance first: _closed_mds_session() may erase the current entry

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      // the rank is now served by a different daemon instance
      old_inc = oldmap->get_incarnation(mds);
      new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
		      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;  // treat the rank as brand new
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      // the MDS is asking clients to reconnect: dial it and replay our state
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session);
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
	// we never saw the reconnect window, so our session state is lost
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session);
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session);
          kick_flushing_caps(session);
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session, true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
	       mds >= mdsmap->get_max_mds()) {
      // the rank disappeared because max_mds shrank
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2764
// Send our cap/snaprealm state to an MDS that entered RECONNECT, so it can
// rebuild its session for us.  Resets per-session and per-cap sequence
// numbers as part of the protocol; statement order here matters.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // drop any queued cap releases — they refer to the old session epoch
  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = MClientReconnect::create();
  // newer MDSs accept the reconnect payload split across several messages
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // flush the current message and start a new one if it is getting huge
      if (allow_multi &&
	  m->get_approx_size() >= (std::numeric_limits<int>::max() >> 1)) {
	m->mark_more();
	session->con->send_message2(std::move(m));

	m = MClientReconnect::create();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(cap.issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      // include our file locks so the MDS can restore lock state
      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
	// cap was invalidated while the session was stale; keep only PIN
	cap.gen = session->cap_gen;
	cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
	cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap.cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap.issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // describe each snaprealm only once per reconnect
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.Signal();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
2857
2858
2859 void Client::kick_requests(MetaSession *session)
2860 {
2861 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2862 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2863 p != mds_requests.end();
2864 ++p) {
2865 MetaRequest *req = p->second;
2866 if (req->got_unsafe)
2867 continue;
2868 if (req->aborted()) {
2869 if (req->caller_cond) {
2870 req->kick = true;
2871 req->caller_cond->Signal();
2872 }
2873 continue;
2874 }
2875 if (req->retry_attempt > 0)
2876 continue; // new requests only
2877 if (req->mds == session->mds_num) {
2878 send_request(p->second, session);
2879 }
2880 }
2881 }
2882
2883 void Client::resend_unsafe_requests(MetaSession *session)
2884 {
2885 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2886 !iter.end();
2887 ++iter)
2888 send_request(*iter, session);
2889
2890 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2891 // process completed requests in clientreplay stage.
2892 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2893 p != mds_requests.end();
2894 ++p) {
2895 MetaRequest *req = p->second;
2896 if (req->got_unsafe)
2897 continue;
2898 if (req->aborted())
2899 continue;
2900 if (req->retry_attempt == 0)
2901 continue; // old requests only
2902 if (req->mds == session->mds_num)
2903 send_request(req, session, true);
2904 }
2905 }
2906
2907 void Client::wait_unsafe_requests()
2908 {
2909 list<MetaRequest*> last_unsafe_reqs;
2910 for (const auto &p : mds_sessions) {
2911 const MetaSession &s = p.second;
2912 if (!s.unsafe_requests.empty()) {
2913 MetaRequest *req = s.unsafe_requests.back();
2914 req->get();
2915 last_unsafe_reqs.push_back(req);
2916 }
2917 }
2918
2919 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2920 p != last_unsafe_reqs.end();
2921 ++p) {
2922 MetaRequest *req = *p;
2923 if (req->unsafe_item.is_on_list())
2924 wait_on_list(req->waitfor_safe);
2925 put_request(req);
2926 }
2927 }
2928
2929 void Client::kick_requests_closed(MetaSession *session)
2930 {
2931 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2932 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2933 p != mds_requests.end(); ) {
2934 MetaRequest *req = p->second;
2935 ++p;
2936 if (req->mds == session->mds_num) {
2937 if (req->caller_cond) {
2938 req->kick = true;
2939 req->caller_cond->Signal();
2940 }
2941 req->item.remove_myself();
2942 if (req->got_unsafe) {
2943 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
2944 req->unsafe_item.remove_myself();
2945 if (is_dir_operation(req)) {
2946 Inode *dir = req->inode();
2947 assert(dir);
2948 dir->set_async_err(-EIO);
2949 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2950 << dir->ino << " " << req->get_tid() << dendl;
2951 req->unsafe_dir_item.remove_myself();
2952 }
2953 if (req->target) {
2954 InodeRef &in = req->target;
2955 in->set_async_err(-EIO);
2956 lderr(cct) << "kick_requests_closed drop req of inode : "
2957 << in->ino << " " << req->get_tid() << dendl;
2958 req->unsafe_target_item.remove_myself();
2959 }
2960 signal_cond_list(req->waitfor_safe);
2961 unregister_request(req);
2962 }
2963 }
2964 }
2965 ceph_assert(session->requests.empty());
2966 ceph_assert(session->unsafe_requests.empty());
2967 }
2968
2969
2970
2971
2972 /************
2973 * leases
2974 */
2975
2976 void Client::got_mds_push(MetaSession *s)
2977 {
2978 s->seq++;
2979 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2980 if (s->state == MetaSession::STATE_CLOSING) {
2981 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2982 }
2983 }
2984
// The MDS is revoking a dentry lease: invalidate our cached lease (if we
// still hold it) and always reply with a RELEASE ack.
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  // REVOKE is the only lease action the MDS sends to clients
  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    // nothing cached locally; just ack the revoke
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // mark our lease invalid
  }

 revoke:
  {
    // acknowledge so the MDS can reclaim the lease state
    auto reply = MClientLease::create(CEPH_MDS_LEASE_RELEASE, seq, m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3025
3026 void Client::put_inode(Inode *in, int n)
3027 {
3028 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3029 int left = in->_put(n);
3030 if (left == 0) {
3031 // release any caps
3032 remove_all_caps(in);
3033
3034 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3035 bool unclean = objectcacher->release_set(&in->oset);
3036 ceph_assert(!unclean);
3037 inode_map.erase(in->vino());
3038 if (use_faked_inos())
3039 _release_faked_ino(in);
3040
3041 if (in == root) {
3042 root = 0;
3043 root_ancestor = 0;
3044 while (!root_parents.empty())
3045 root_parents.erase(root_parents.begin());
3046 }
3047
3048 delete in;
3049 }
3050 }
3051
3052 void Client::close_dir(Dir *dir)
3053 {
3054 Inode *in = dir->parent_inode;
3055 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3056 ceph_assert(dir->is_empty());
3057 ceph_assert(in->dir == dir);
3058 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3059 if (!in->dentries.empty())
3060 in->get_first_parent()->put(); // unpin dentry
3061
3062 delete in->dir;
3063 in->dir = 0;
3064 put_inode(in); // unpin inode
3065 }
3066
/**
 * Link inode 'in' into the client-side namespace under (dir, name),
 * returning the (new or reused) Dentry.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * If 'in' is a directory that already has a parent dentry, the old link is
 * removed first — directories are single-parent.
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn); // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // reuse the caller's pre-created (currently negative) dentry
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // the old parent dir's contents changed; drop its completeness state
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3107
/**
 * Detach a dentry from its inode and, optionally, from its directory.
 *
 * @param dn          dentry to unlink
 * @param keepdir     if false, close the containing Dir when it becomes empty
 * @param keepdentry  if true, keep the (now null) dentry around but drop its
 *                    lease; if false, remove and free the dentry
 */
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // hold a local ref so the inode survives until we're done logging it
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
		 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink  inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // invalidate any dentry lease; the dentry itself stays cached
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink  removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3137
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 *
 * Holds an InodeRef so the inode outlives the in-flight flush; on
 * failure the error is stashed on the inode so the next fsync/close
 * can report it.
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;   // pins the inode for the duration of the flush
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    // completion runs under the client lock
    ceph_assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
			    << " 0x" << std::hex << inode->ino << std::dec
			    << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // remember the error for later reporting to the application
      inode->set_async_err(r);
    }
  }
};
3159
3160
3161 /****
3162 * caps
3163 */
3164
3165 void Client::get_cap_ref(Inode *in, int cap)
3166 {
3167 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3168 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3169 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3170 in->get();
3171 }
3172 if ((cap & CEPH_CAP_FILE_CACHE) &&
3173 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3174 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3175 in->get();
3176 }
3177 in->get_cap_ref(cap);
3178 }
3179
/**
 * Drop references on the given cap bits.  When the last reference on some
 * bit goes away, finish work that was blocked on in-flight IO: complete a
 * pending cap_snap, wake writers waiting on buffered-data commit, tell the
 * MDS about caps we can now drop, and unpin the inode pins taken in
 * get_cap_ref().
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we held a ref on that are no longer even issued can be dropped
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	// last writer is gone; the newest cap_snap can now be finalized
	ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	// no more buffered data anywhere; snapshots are clean too
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      // release the inode pins taken by get_cap_ref() for BUFFER/CACHE
      put_inode(in, put_nref);
  }
}
3213
/**
 * Acquire references on `need` caps (plus whatever of `want` is issued and
 * not mid-revocation) before performing file IO.  Loops until the caps are
 * available, waiting on the inode's cap/commit wait lists as appropriate.
 *
 * @param in     inode being accessed
 * @param need   cap bits that must be held (e.g. Fr/Fw)
 * @param want   cap bits that are nice to have (e.g. Fc/Fb)
 * @param phave  out: the cap bits actually pinned on success
 * @param endoff for writes, the end offset — used to grow max_size;
 *               pass <= 0 for reads
 * @return 0 on success; -EBADF if open file handles no longer want the
 *         needed caps; -EROFS for a write via a readonly session; or an
 *         error from cap renewal.
 */
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // writing past (or near) max_size: ask the auth MDS for more room
      if (endoff > 0 &&
	  (endoff >= (loff_t)in->max_size ||
	   endoff > (loff_t)(in->size << 1)) &&
	  endoff > (loff_t)in->wanted_max_size) {
	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	in->wanted_max_size = endoff;
	check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
	// a snapshot is being finalized; new writes must not race with it
	if (in->cap_snaps.rbegin()->second.writing) {
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	// dirty snapshot data must be flushed out before we write more
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		       << " need " << ccap_string(need) << " want " << ccap_string(want)
		       << " revoking " << ccap_string(revoking)
		       << dendl;
	// only take the extra `want` bits if none of them is being revoked
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // a previous session reset dropped our caps; the MDS may not know
      // what we want anymore, so re-request before blocking
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if (!(file_wanted & ~mds_wanted))
	in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3306
// Return the cap bits currently in use.  Fc counts as used whenever the
// object cacher still holds any data for this inode, even if no explicit
// FILE_CACHE reference is held (the short-circuit avoids the cache query
// when the ref already says so).
int Client::get_caps_used(Inode *in)
{
  unsigned used = in->caps_used();
  if (!(used & CEPH_CAP_FILE_CACHE) &&
      !objectcacher->set_is_empty(&in->oset))
    used |= CEPH_CAP_FILE_CACHE;
  return used;
}
3315
// Defer releasing this inode's caps: push it to the back of the delayed
// list and stamp the earliest time (now + client_caps_release_delay) at
// which the tick path may actually release them.
void Client::cap_delay_requeue(Inode *in)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  in->hold_caps_until = ceph_clock_now();
  in->hold_caps_until += cct->_conf->client_caps_release_delay;
  delayed_list.push_back(&in->delay_cap_item);
}
3323
/**
 * Build and send a cap UPDATE message to one MDS: release caps we no longer
 * want to retain, report current usage/wants, and carry any dirty metadata
 * being flushed (identified by flush_tid).
 *
 * @param in        inode whose cap state is being reported
 * @param session   session to the target MDS
 * @param cap       this client's cap record for that MDS
 * @param flags     MClientCaps::FLAG_* bits to set on the message
 * @param used      cap bits currently in use
 * @param want      cap bits we want to keep wanting
 * @param retain    cap bits we want to keep holding
 * @param flush     dirty cap bits being flushed (0 if none)
 * @param flush_tid tid identifying this flush
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      int flags, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
		 << " mds." << session->mds_num << " seq " << cap->seq
		 << " used " << ccap_string(used)
		 << " want " << ccap_string(want)
		 << " flush " << ccap_string(flush)
		 << " retain " << ccap_string(retain)
		 << " held "<< ccap_string(held)
		 << " revoking " << ccap_string(revoking)
		 << " dropping " << ccap_string(dropping)
		 << dendl;

  // test hook: pretend we failed to release revoked caps
  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;
  
  auto m = MClientCaps::create(op,
			   in->ino,
			   0,
			   cap->cap_id, cap->seq,
			   cap->implemented,
			   want,
			   flush,
			   cap->mseq,
                           cap_epoch_barrier);
  // attribute flushed metadata to whoever dirtied it
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;
  
  m->head.nlink = in->nlink;
  
  if (flush & CEPH_CAP_XATTR_EXCL) {
    // xattrs only travel when the xattr cap is being flushed
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }
  
  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // tell the MDS a cap_snap flush is still pending so it won't trim state
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth MDS manages max_size
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3436
3437 static bool is_max_size_approaching(Inode *in)
3438 {
3439 /* mds will adjust max size according to the reported size */
3440 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3441 return false;
3442 if (in->size >= in->max_size)
3443 return true;
3444 /* half of previous max_size increment has been used */
3445 if (in->max_size > in->reported_size &&
3446 (in->size << 1) >= in->max_size + in->reported_size)
3447 return true;
3448 return false;
3449 }
3450
3451 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3452 {
3453 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3454 return used;
3455 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3456 return used;
3457
3458 if (issued & CEPH_CAP_FILE_LAZYIO) {
3459 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3460 used &= ~CEPH_CAP_FILE_CACHE;
3461 used |= CEPH_CAP_FILE_LAZYIO;
3462 }
3463 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3464 used &= ~CEPH_CAP_FILE_BUFFER;
3465 used |= CEPH_CAP_FILE_LAZYIO;
3466 }
3467 } else {
3468 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3469 used &= ~CEPH_CAP_FILE_CACHE;
3470 used |= CEPH_CAP_FILE_LAZYIO;
3471 }
3472 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3473 used &= ~CEPH_CAP_FILE_BUFFER;
3474 used |= CEPH_CAP_FILE_LAZYIO;
3475 }
3476 }
3477 return used;
3478 }
3479
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * For each MDS holding a cap on this inode, decide whether a cap message
 * must be sent now (the `ack` path) — to grow max_size, acknowledge a
 * completed revocation, request more caps, or flush dirty metadata — or
 * whether the release can be deferred via the delayed list.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_NODELAY,
 *              CHECK_CAPS_SYNCHRONOUS)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
	       (issued & CEPH_CAP_FILE_SHARED) &&
	       (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
	retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
	   << " wanted " << ccap_string(wanted)
	   << " used " << ccap_string(used)
	   << " issued " << ccap_string(issued)
	   << " revoking " << ccap_string(revoking)
	   << " flags=" << flags
	   << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // revocation of Fc/Fl is blocked only by actual cache contents, not by
  // buffered writes; try to invalidate the cache so the revoke can finish
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }


  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    // caps held via the auth MDS don't count as "used" against other MDSs
    cap_used = used;
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;
    
    ldout(cct, 10) << " cap mds." << mds
	     << " issued " << ccap_string(cap.issued)
	     << " implemented " << ccap_string(cap.implemented)
	     << " revoking " << ccap_string(revoking) << dendl;

    // need a bigger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
	in->wanted_max_size > in->requested_max_size &&
	&cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
	&cap == in->auth_cap &&
	is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
		     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    // during unmount, release idle caps immediately
    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
	!in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      // auth MDS also handles re-flushes and pending snap flushes
      if (in->flags & I_KICK_FLUSH) {
	ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
		       << " to mds." << mds << dendl;
	kick_flushing_caps(in, session);
      }
      if (!in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.flush_tid == 0)
	flush_snaps(in);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    int msg_flags = (flags & CHECK_CAPS_SYNCHRONOUS) ? MClientCaps::FLAG_SYNC : 0;
    send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
	     flushing, flush_tid);
  }
}
3625
3626
/**
 * Capture a point-in-time snapshot of this inode's dirty state (a CapSnap)
 * for the snap context that is being superseded, so it can later be flushed
 * to the MDS.  No-op if there is nothing dirty or a cap_snap is already
 * pending for an in-progress write.
 */
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // a writer is still in flight; that cap_snap will pick up this state
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
	     (dirty & CEPH_CAP_ANY_WR)) {
    // keyed by the superseded snap context's seq
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();
    
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
    
    // freeze the metadata that belongs to the snapshot
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
 
    if (used & CEPH_CAP_FILE_WR) {
      // finalize later, once the current writer drops its Fw ref
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}
3668
/**
 * Finalize a CapSnap once writes to it have stopped: record the final
 * size/timestamps, and either flush it to the MDS now or wait for
 * buffered data to be written back first.
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // buffered data still pending; _flushed_cap_snap() will flush later
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3699
3700 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3701 {
3702 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
3703 in->cap_snaps.at(seq).dirty_data = 0;
3704 flush_snaps(in);
3705 }
3706
3707 void Client::send_flush_snap(Inode *in, MetaSession *session,
3708 snapid_t follows, CapSnap& capsnap)
3709 {
3710 auto m = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP,
3711 in->ino, in->snaprealm->ino, 0,
3712 in->auth_cap->mseq, cap_epoch_barrier);
3713 m->caller_uid = capsnap.cap_dirtier_uid;
3714 m->caller_gid = capsnap.cap_dirtier_gid;
3715
3716 m->set_client_tid(capsnap.flush_tid);
3717 m->head.snap_follows = follows;
3718
3719 m->head.caps = capsnap.issued;
3720 m->head.dirty = capsnap.dirty;
3721
3722 m->head.uid = capsnap.uid;
3723 m->head.gid = capsnap.gid;
3724 m->head.mode = capsnap.mode;
3725 m->btime = capsnap.btime;
3726
3727 m->size = capsnap.size;
3728
3729 m->head.xattr_version = capsnap.xattr_version;
3730 encode(capsnap.xattrs, m->xattrbl);
3731
3732 m->ctime = capsnap.ctime;
3733 m->btime = capsnap.btime;
3734 m->mtime = capsnap.mtime;
3735 m->atime = capsnap.atime;
3736 m->time_warp_seq = capsnap.time_warp_seq;
3737 m->change_attr = capsnap.change_attr;
3738
3739 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3740 m->inline_version = in->inline_version;
3741 m->inline_data = in->inline_data;
3742 }
3743
3744 ceph_assert(!session->flushing_caps_tids.empty());
3745 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3746
3747 session->con->send_message2(std::move(m));
3748 }
3749
/**
 * Flush all finalized, not-yet-flushed cap_snaps of this inode to the auth
 * MDS, in snap order.  Stops at the first cap_snap that still has dirty
 * buffered data or an in-progress writer, since later snaps must not be
 * flushed before earlier ones.
 */
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    // snaps must flush in order; stop at the first unfinished one
    if (capsnap.dirty_data || capsnap.writing)
      break;
    
    capsnap.flush_tid = ++last_flush_tid;
    // track the flush on both the session and the inode
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
3784
3785 void Client::wait_on_list(list<Cond*>& ls)
3786 {
3787 Cond cond;
3788 ls.push_back(&cond);
3789 cond.Wait(client_lock);
3790 ls.remove(&cond);
3791 }
3792
3793 void Client::signal_cond_list(list<Cond*>& ls)
3794 {
3795 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3796 (*it)->Signal();
3797 }
3798
/**
 * Block until some thread drains the given context list via
 * signal_context_list().  Queues a C_Cond whose completion sets `done`
 * and signals the condvar; client_lock is released while waiting.
 */
void Client::wait_on_context_list(list<Context*>& ls)
{
  Cond cond;
  bool done = false;
  int r;  // written by C_Cond on completion; value unused here
  ls.push_back(new C_Cond(&cond, &done, &r));
  while (!done)
    cond.Wait(client_lock);
}
3808
3809 void Client::signal_context_list(list<Context*>& ls)
3810 {
3811 while (!ls.empty()) {
3812 ls.front()->complete(0);
3813 ls.pop_front();
3814 }
3815 }
3816
/**
 * Wake all threads waiting for caps on every inode with a cap in this
 * session.  On reconnect, also reset max_size negotiation state; otherwise
 * downgrade caps the MDS failed to re-issue after the session went stale.
 */
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      // max_size must be renegotiated with the (new) auth MDS
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
	// mds did not re-issue stale cap.
	cap->issued = cap->implemented = CEPH_CAP_PIN;
	// make sure mds knows what we want.
	if (in.caps_file_wanted() & ~cap->wanted)
	  in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
3836
3837
3838 // flush dirty data (from objectcache)
3839
/**
 * Deferred kernel/page-cache invalidation request, queued on the async
 * invalidator thread.  Captures the (possibly faked) vinodeno at
 * construction time because the Inode* may be gone by the time it runs.
 */
class C_Client_CacheInvalidate : public  Context  {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3859
// Runs on the async invalidator thread (outside client_lock): forward a
// cache-invalidate request for [off, off+len) to the registered callback.
// Skipped during unmount, when the callback target may be torn down.
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
3867
3868 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3869
3870 if (ino_invalidate_cb)
3871 // we queue the invalidate, which calls the callback and decrements the ref
3872 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3873 }
3874
// Drop ALL cached data for this inode: release clean data from the
// userspace object cacher (logging if dirty data prevents a full release)
// and schedule a full-range invalidation of any kernel-side cache.
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  // 0~0 means "the whole file" to the invalidate callback
  _schedule_invalidate_callback(in, 0, 0);
}
3888
// Drop cached data for byte range [off, off+len) of this inode: discard
// the matching extents from the userspace object cacher (including
// in-flight writeback) and schedule a kernel-side invalidation.
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // map the file range onto its striped objects
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
3902
3903 bool Client::_release(Inode *in)
3904 {
3905 ldout(cct, 20) << "_release " << *in << dendl;
3906 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3907 _invalidate_inode_cache(in);
3908 return true;
3909 }
3910 return false;
3911 }
3912
/**
 * Write back this inode's dirty buffered data, invoking onfinish when done.
 *
 * @return true if the flush completed (or was resolved) synchronously,
 *         false if onfinish will be called later by the object cacher.
 */
bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    // pool is full: writeback cannot succeed, so throw the dirty data
    // away and report ENOSPC rather than blocking forever
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}
3934
/**
 * Synchronously write back dirty buffered data in [offset, offset+size).
 * Drops client_lock while waiting for the writeback to complete, so the
 * caller must tolerate the lock being released and re-acquired.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    onflush.wait();
    client_lock.Lock();
  }
}
3953
// Object-cacher callback fired when an inode's dirty set finishes flushing;
// translates the ObjectSet back to its owning Inode and completes the flush.
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  //  std::lock_guard l(client_lock);
  ceph_assert(client_lock.is_locked());   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}
3962
// All buffered data for this inode has been written back: drop the
// FILE_CACHE/FILE_BUFFER cap references held for the duration of the flush.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3969
3970
3971
3972 // checks common to add_update_cap, handle_cap_grant
3973 void Client::check_cap_issue(Inode *in, unsigned issued)
3974 {
3975 unsigned had = in->caps_issued();
3976
3977 if ((issued & CEPH_CAP_FILE_CACHE) &&
3978 !(had & CEPH_CAP_FILE_CACHE))
3979 in->cache_gen++;
3980
3981 if ((issued & CEPH_CAP_FILE_SHARED) &&
3982 !(had & CEPH_CAP_FILE_SHARED)) {
3983 in->shared_gen++;
3984
3985 if (in->is_dir())
3986 clear_dir_complete_and_ordered(in, true);
3987 }
3988 }
3989
/**
 * Record a cap grant from an MDS: create or update this client's Cap for
 * that MDS, maintain snaprealm membership, track auth-cap migration, and
 * wake anyone waiting on newly granted caps.
 *
 * @param in          inode the cap applies to
 * @param mds_session session the grant arrived on
 * @param cap_id      MDS-assigned cap id
 * @param issued      caps being granted
 * @param wanted      caps the MDS believes we want
 * @param seq, mseq   cap sequence / migration sequence from the message
 * @param realm       snaprealm ino, or inodeno_t(-1) if unchanged
 * @param flags       CEPH_CAP_FLAG_* (notably FLAG_AUTH)
 * @param cap_perms   credentials associated with the latest cap request
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
			    inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap on this inode: join its snaprealm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
	realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      // auth MDS moved the inode to a different snaprealm
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // updating an existing cap record
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      ceph_assert(&cap == in->auth_cap);
      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	// in-flight flushes must follow the inode to the new auth MDS
	ldout(cct, 10) << __func__ << " changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // a newer migration seq resets wanted; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
	continue;
      if (p.second.implemented & ~p.second.issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4084
/**
 * Remove one cap record from its inode and session.  Optionally queue a
 * cap-release message so the MDS learns we dropped it.  When the last cap
 * goes, the inode also leaves its snaprealm.
 *
 * Note: `cap` is destroyed by the erase below — it must not be used after
 * this function returns.
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
  
  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    // losing the auth cap also stops any flushing bookkeeping for it
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing from the map destroys the Cap object
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4120
4121 void Client::remove_all_caps(Inode *in)
4122 {
4123 while (!in->caps.empty())
4124 remove_cap(&in->caps.begin()->second, true);
4125 }
4126
/*
 * Tear down every cap held on session s (session death / forced close).
 * Caps are dropped without queueing releases, since the session they would
 * be sent on is going away. Any dirty or flushing state on auth caps is
 * lost; it is forcibly marked clean and the flushing ref dropped.
 */
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);   // keep the inode alive across remove_cap()
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      // only the auth cap tracks dirty/flushing state and size requests
      dirty_caps = in->dirty_caps | in->flushing_caps;  // any bit set -> true
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;  // remember caps were dropped uncleanly
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // dirty data cannot be flushed any more; force-clean the inode
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in.get());
    }
    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4159
/*
 * Ask the mounter to remount the filesystem, which causes the kernel to
 * drop its unused dentries. Returns the remount callback's result (0 on
 * success). On persistent failure the client may abort, depending on the
 * client_die_on_failed_remount / client_die_on_failed_dentry_invalidate
 * options.
 *
 * retry_on_error: tolerate up to mds_max_retries_on_remount_failure
 * consecutive failures before the abort logic kicks in.
 */
int Client::_do_remount(bool retry_on_error)
{
  uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");

  errno = 0;  // clear so a stale errno is not misattributed to remount_cb
  int r = remount_cb(callback_handle);
  if (r == 0) {
    retries_on_invalidate = 0;  // success resets the failure streak
  } else {
    int e = errno;  // capture immediately; later calls may clobber errno
    client_t whoami = get_nodeid();  // NOTE(review): appears unused below — confirm
    if (r == -1) {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "errno = " << e << " (" << strerror(e) << ")" << dendl;
    } else {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "return code = " << r << dendl;
    }
    // abort only if configured to die on invalidation failure and the
    // retry budget (if any) is exhausted
    bool should_abort =
      (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
       cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
      !(retry_on_error && (++retries_on_invalidate < max_retries));
    if (should_abort && !unmounting) {
      lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
      ceph_abort();
    }
  }
  return r;
}
4191
4192 class C_Client_Remount : public Context {
4193 private:
4194 Client *client;
4195 public:
4196 explicit C_Client_Remount(Client *c) : client(c) {}
4197 void finish(int r) override {
4198 ceph_assert(r == 0);
4199 client->_do_remount(true);
4200 }
4201 };
4202
4203 void Client::_invalidate_kernel_dcache()
4204 {
4205 if (unmounting)
4206 return;
4207 if (can_invalidate_dentries) {
4208 if (dentry_invalidate_cb && root->dir) {
4209 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4210 p != root->dir->dentries.end();
4211 ++p) {
4212 if (p->second->inode)
4213 _schedule_invalidate_dentry_callback(p->second, false);
4214 }
4215 }
4216 } else if (remount_cb) {
4217 // Hacky:
4218 // when remounting a file system, linux kernel trims all unused dentries in the fs
4219 remount_finisher.queue(new C_Client_Remount(this));
4220 }
4221 }
4222
/*
 * If every dentry in this directory is a null (negative) dentry, expire
 * them all and close the now-empty Dir. Recurses into the snapdir if one
 * is open on this inode.
 */
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may erase dn from the map
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
	unlink(dn, true, false); // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4247
/*
 * The MDS asked us to shrink our cap count on session s down to 'max'.
 * Unused non-auth caps are dropped outright; for other caps we try to make
 * the inode trimmable by expiring its dentries (queued and trimmed after
 * traversal, so the cap list is not mutated under the iterator).
 */
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
		 << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	// comma expression: remove the cap, then null the dangling pointer
	cap = (remove_cap(cap, true), nullptr);
	trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;  // stays true only if every dentry was expireable
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
	Dentry *dn = *q;
	++q;
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
	  ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
	  to_trim.insert(dn);
	} else {
	  ldout(cct, 20) << " not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      if (all && in->ino != MDS_INO_ROOT) {
	ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();  // still over budget: nudge the kernel dcache
}
4315
4316 void Client::force_session_readonly(MetaSession *s)
4317 {
4318 s->readonly = true;
4319 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4320 auto &in = (*p)->inode;
4321 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4322 signal_cond_list(in.waitfor_caps);
4323 }
4324 }
4325
4326 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4327 {
4328 MetaSession *session = in->auth_cap->session;
4329
4330 int flushing = in->dirty_caps;
4331 ceph_assert(flushing);
4332
4333 ceph_tid_t flush_tid = ++last_flush_tid;
4334 in->flushing_cap_tids[flush_tid] = flushing;
4335
4336 if (!in->flushing_caps) {
4337 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4338 num_flushing_caps++;
4339 } else {
4340 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4341 }
4342
4343 in->flushing_caps |= flushing;
4344 in->mark_caps_clean();
4345
4346 if (!in->flushing_cap_item.is_on_list())
4347 session->flushing_caps.push_back(&in->flushing_cap_item);
4348 session->flushing_caps_tids.insert(flush_tid);
4349
4350 *ptid = flush_tid;
4351 return flushing;
4352 }
4353
4354 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4355 {
4356 for (auto &p : in->cap_snaps) {
4357 CapSnap &capsnap = p.second;
4358 if (capsnap.flush_tid > 0) {
4359 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4360 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4361 }
4362 }
4363 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4364 it != in->flushing_cap_tids.end();
4365 ++it) {
4366 old_s->flushing_caps_tids.erase(it->first);
4367 new_s->flushing_caps_tids.insert(it->first);
4368 }
4369 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4370 }
4371
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // first drain the delayed list
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;  // advance before pop_front invalidates the current entry
    delayed_list.pop_front();
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;  // very last check_caps waits
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;  // very last check_caps waits
    check_caps(in, flags);
  }
}
4404
/*
 * Block until every cap flush on 'in' with tid <= want has been acked.
 * handle_cap_flush_ack() retires tids in order and signals waitfor_caps.
 */
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;  // oldest outstanding flush is already newer than 'want'
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4418
4419 void Client::wait_sync_caps(ceph_tid_t want)
4420 {
4421 retry:
4422 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4423 << num_flushing_caps << " total flushing)" << dendl;
4424 for (auto &p : mds_sessions) {
4425 MetaSession *s = &p.second;
4426 if (s->flushing_caps_tids.empty())
4427 continue;
4428 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4429 if (oldest_tid <= want) {
4430 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4431 << " (want " << want << ")" << dendl;
4432 sync_cond.Wait(client_lock);
4433 goto retry;
4434 }
4435 }
4436 }
4437
/*
 * Re-send every outstanding cap flush and capsnap flush for 'in' to its
 * auth MDS (e.g. after MDS restart/reconnect). Entries in
 * flushing_cap_tids with a zero cap mask represent capsnap flushes and
 * walk cap_snaps in lockstep.
 */
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the newest pending capsnap flush; cap flushes older than it must
  // carry FLAG_PENDING_CAPSNAP so the MDS knows a capsnap is still coming
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // regular cap flush
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
	       p.second, p.first);
    } else {
      // capsnap flush; matching cap_snaps entry must carry the same tid
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4471
4472 void Client::kick_flushing_caps(MetaSession *session)
4473 {
4474 mds_rank_t mds = session->mds_num;
4475 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4476
4477 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4478 Inode *in = *p;
4479 if (in->flags & I_KICK_FLUSH) {
4480 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4481 kick_flushing_caps(in, session);
4482 }
4483 }
4484 }
4485
/*
 * Called at the start of MDS reconnect. Inodes whose flushing caps are
 * still fully issued are merely flagged (I_KICK_FLUSH) for a normal kick
 * later; inodes whose flushing caps were revoked get their flushes re-sent
 * immediately, before the reconnect message.
 */
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4513
4514 void SnapRealm::build_snap_context()
4515 {
4516 set<snapid_t> snaps;
4517 snapid_t max_seq = seq;
4518
4519 // start with prior_parents?
4520 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4521 snaps.insert(prior_parent_snaps[i]);
4522
4523 // current parent's snaps
4524 if (pparent) {
4525 const SnapContext& psnapc = pparent->get_snap_context();
4526 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4527 if (psnapc.snaps[i] >= parent_since)
4528 snaps.insert(psnapc.snaps[i]);
4529 if (psnapc.seq > max_seq)
4530 max_seq = psnapc.seq;
4531 }
4532
4533 // my snaps
4534 for (unsigned i=0; i<my_snaps.size(); i++)
4535 snaps.insert(my_snaps[i]);
4536
4537 // ok!
4538 cached_snap_context.seq = max_seq;
4539 cached_snap_context.snaps.resize(0);
4540 cached_snap_context.snaps.reserve(snaps.size());
4541 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4542 cached_snap_context.snaps.push_back(*p);
4543 }
4544
4545 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4546 {
4547 list<SnapRealm*> q;
4548 q.push_back(realm);
4549
4550 while (!q.empty()) {
4551 realm = q.front();
4552 q.pop_front();
4553
4554 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4555 realm->invalidate_cache();
4556
4557 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4558 p != realm->pchildren.end();
4559 ++p)
4560 q.push_back(*p);
4561 }
4562 }
4563
4564 SnapRealm *Client::get_snap_realm(inodeno_t r)
4565 {
4566 SnapRealm *realm = snap_realms[r];
4567 if (!realm)
4568 snap_realms[r] = realm = new SnapRealm(r);
4569 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4570 realm->nref++;
4571 return realm;
4572 }
4573
4574 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4575 {
4576 if (snap_realms.count(r) == 0) {
4577 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4578 return NULL;
4579 }
4580 SnapRealm *realm = snap_realms[r];
4581 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4582 realm->nref++;
4583 return realm;
4584 }
4585
/*
 * Drop one reference on 'realm'. On the last ref, unlink it from the realm
 * map and its parent (recursively releasing the parent's ref) and delete it.
 */
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);  // may cascade up the tree
    }
    delete realm;
  }
}
4599
4600 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4601 {
4602 if (realm->parent != parent) {
4603 ldout(cct, 10) << __func__ << " " << *realm
4604 << " " << realm->parent << " -> " << parent << dendl;
4605 realm->parent = parent;
4606 if (realm->pparent) {
4607 realm->pparent->pchildren.erase(realm);
4608 put_snap_realm(realm->pparent);
4609 }
4610 realm->pparent = get_snap_realm(parent);
4611 realm->pparent->pchildren.insert(realm);
4612 return true;
4613 }
4614 return false;
4615 }
4616
/*
 * True if new_snapc contains a snapid newer than anything old_snapc had
 * seen: snaps[] is ordered newest-first, so snaps[0] is the newest, and
 * old_snapc.seq is the newest snapid old_snapc knew about.
 */
static bool has_new_snaps(const SnapContext& old_snapc,
			  const SnapContext& new_snapc)
{
  return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
}
4622
4623
4624 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4625 {
4626 SnapRealm *first_realm = NULL;
4627 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4628
4629 map<SnapRealm*, SnapContext> dirty_realms;
4630
4631 auto p = bl.cbegin();
4632 while (!p.end()) {
4633 SnapRealmInfo info;
4634 decode(info, p);
4635 SnapRealm *realm = get_snap_realm(info.ino());
4636
4637 bool invalidate = false;
4638
4639 if (info.seq() > realm->seq) {
4640 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4641 << dendl;
4642
4643 if (flush) {
4644 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4645 // flush me + children
4646 list<SnapRealm*> q;
4647 q.push_back(realm);
4648 while (!q.empty()) {
4649 SnapRealm *realm = q.front();
4650 q.pop_front();
4651
4652 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4653 p != realm->pchildren.end();
4654 ++p)
4655 q.push_back(*p);
4656
4657 if (dirty_realms.count(realm) == 0) {
4658 realm->nref++;
4659 dirty_realms[realm] = realm->get_snap_context();
4660 }
4661 }
4662 }
4663
4664 // update
4665 realm->seq = info.seq();
4666 realm->created = info.created();
4667 realm->parent_since = info.parent_since();
4668 realm->prior_parent_snaps = info.prior_parent_snaps;
4669 realm->my_snaps = info.my_snaps;
4670 invalidate = true;
4671 }
4672
4673 // _always_ verify parent
4674 if (adjust_realm_parent(realm, info.parent()))
4675 invalidate = true;
4676
4677 if (invalidate) {
4678 invalidate_snaprealm_and_children(realm);
4679 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4680 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4681 } else {
4682 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4683 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4684 }
4685
4686 if (!first_realm)
4687 first_realm = realm;
4688 else
4689 put_snap_realm(realm);
4690 }
4691
4692 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4693 q != dirty_realms.end();
4694 ++q) {
4695 SnapRealm *realm = q->first;
4696 // if there are new snaps ?
4697 if (has_new_snaps(q->second, realm->get_snap_context())) {
4698 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4699 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4700 while (!r.end()) {
4701 Inode *in = *r;
4702 ++r;
4703 queue_cap_snap(in, q->second);
4704 }
4705 } else {
4706 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4707 }
4708 put_snap_realm(realm);
4709 }
4710
4711 if (realm_ret)
4712 *realm_ret = first_realm;
4713 else
4714 put_snap_realm(first_realm);
4715 }
4716
/*
 * Handle an MClientSnap notification from an MDS. For a SPLIT op, the
 * inodes and child realms listed in the message are moved out of their
 * current realm into the newly split-off one; in all cases the attached
 * snap trace is applied via update_snap_trace().
 */
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  // inode -> its pre-split snap context, for queueing cap snaps below
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	if (in->snaprealm->created > info.created()) {
	  // inode already belongs to a realm newer than the split target
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the trace; flush dirty caps first unless this is a DESTROY
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // re-attach the moved inodes to the new (split) realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4787
4788 void Client::handle_quota(const MConstRef<MClientQuota>& m)
4789 {
4790 mds_rank_t mds = mds_rank_t(m->get_source().num());
4791 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4792 if (!session) {
4793 return;
4794 }
4795
4796 got_mds_push(session);
4797
4798 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
4799
4800 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4801 if (inode_map.count(vino)) {
4802 Inode *in = NULL;
4803 in = inode_map[vino];
4804
4805 if (in) {
4806 in->quota = m->quota;
4807 in->rstat = m->rstat;
4808 }
4809 }
4810 }
4811
/*
 * Top-level dispatcher for MClientCaps messages from an MDS: record OSD
 * epoch barriers, look up the inode, and hand off to the per-op handler.
 */
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;  // message from a connection we no longer track
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;
  } else {
    // unknown inode: an IMPORT gets its cap released back immediately,
    // everything else is just dropped
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
    }

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
    case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
    case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
    // IMPORT deliberately does not return: the freshly imported cap is
    // then processed by the grant path below
    case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
  }

  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
      case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
      case CEPH_CAP_OP_IMPORT:
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
      case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
4875
/*
 * Another MDS exported this inode's caps to 'session'. Record the new cap
 * as auth (add_update_cap), retire the peer cap referenced by the message,
 * and re-kick any flushes now that this session is (possibly) the auth.
 */
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;  // carry the old cap's perms over
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  if (cap && cap->cap_id == m->peer.cap_id) {
    // drop the old peer cap; queue a release only if the peer asked for it
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
4911
/*
 * The sending MDS is exporting our cap for this inode, either migrating it
 * to a peer MDS (m->peer.cap_id set) or dropping it entirely. Fold our
 * state into the peer cap when possible, then remove the local cap.
 */
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	// cap migrated to a peer MDS
	const auto peer_mds = mds_rank_t(m->peer.mds);
	MetaSession *tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);
	if (it != in->caps.end()) {
	  // merge into the existing peer cap, if ours is newer info
	  Cap &tcap = it->second;
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession);
	  }
	} else {
	  // no cap on the peer yet: create one seeded from ours
	  add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	// no peer: the cap is simply being dropped
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
4956
/*
 * MDS-initiated truncation notification: fold the new size and truncation
 * state into the inode via update_inode_file_size().
 */
void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));

  ldout(cct, 10) << __func__ << " on ino " << *in
		 << " size " << in->size << " -> " << m->get_size()
		 << dendl;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();  // treat dirty bits as issued for the update
  update_inode_file_size(in, issued, m->get_size(),
			 m->get_truncate_seq(), m->get_truncate_size());
}
4972
4973 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
4974 {
4975 ceph_tid_t flush_ack_tid = m->get_client_tid();
4976 int dirty = m->get_dirty();
4977 int cleaned = 0;
4978 int flushed = 0;
4979
4980 auto it = in->flushing_cap_tids.begin();
4981 if (it->first < flush_ack_tid) {
4982 ldout(cct, 0) << __func__ << " mds." << session->mds_num
4983 << " got unexpected flush ack tid " << flush_ack_tid
4984 << " expected is " << it->first << dendl;
4985 }
4986 for (; it != in->flushing_cap_tids.end(); ) {
4987 if (!it->second) {
4988 // cap snap
4989 ++it;
4990 continue;
4991 }
4992 if (it->first == flush_ack_tid)
4993 cleaned = it->second;
4994 if (it->first <= flush_ack_tid) {
4995 session->flushing_caps_tids.erase(it->first);
4996 in->flushing_cap_tids.erase(it++);
4997 ++flushed;
4998 continue;
4999 }
5000 cleaned &= ~it->second;
5001 if (!cleaned)
5002 break;
5003 ++it;
5004 }
5005
5006 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5007 << " cleaned " << ccap_string(cleaned) << " on " << *in
5008 << " with " << ccap_string(dirty) << dendl;
5009
5010 if (flushed) {
5011 signal_cond_list(in->waitfor_caps);
5012 if (session->flushing_caps_tids.empty() ||
5013 *session->flushing_caps_tids.begin() > flush_ack_tid)
5014 sync_cond.Signal();
5015 }
5016
5017 if (!dirty) {
5018 in->cap_dirtier_uid = -1;
5019 in->cap_dirtier_gid = -1;
5020 }
5021
5022 if (!cleaned) {
5023 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5024 } else {
5025 if (in->flushing_caps) {
5026 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5027 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5028 in->flushing_caps &= ~cleaned;
5029 if (in->flushing_caps == 0) {
5030 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5031 num_flushing_caps--;
5032 if (in->flushing_cap_tids.empty())
5033 in->flushing_cap_item.remove_myself();
5034 }
5035 if (!in->caps_dirty())
5036 put_inode(in);
5037 }
5038 }
5039 }
5040
5041
/*
 * FLUSHSNAP ack from the MDS: retire the capsnap identified by 'follows'
 * (after verifying the flush tid matches) and wake flush waiters.
 * Duplicate/unknown acks are logged and ignored.
 */
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      InodeRef tmp_ref(in);  // keep inode alive while erasing the capsnap
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.Signal();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5074
// Finisher context that forwards a dentry-invalidate notification to the
// registered callback outside client_lock. The (dir ino, ino, name) triple
// is captured at construction time, since the Dentry may be gone by the
// time finish() runs on the finisher thread.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory of the dentry
  vinodeno_t ino;     // dentry's inode; zero ino when del == false
  string name;        // dentry name within dirino
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();  // zero ino signals a non-delete invalidate
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5102
5103 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5104 {
5105 if (unmounting)
5106 return;
5107 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5108 << " in dir " << dirino << dendl;
5109 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5110 }
5111
5112 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5113 {
5114 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5115 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5116 }
5117
/*
 * Try to drop cached references pinning 'in' so it can be trimmed: expire
 * child dentries (closing the Dir when empty), trim an open snapdir, and
 * finally unlink the inode's own dentries, optionally scheduling kernel
 * dentry invalidation (sched_inval).
 */
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may erase dn
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // account for the ref close_dir released — TODO confirm
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0) {
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5164
5165 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5166 {
5167 mds_rank_t mds = session->mds_num;
5168 int used = get_caps_used(in);
5169 int wanted = in->caps_wanted();
5170
5171 const unsigned new_caps = m->get_caps();
5172 const bool was_stale = session->cap_gen > cap->gen;
5173 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5174 << " mds." << mds << " seq " << m->get_seq()
5175 << " caps now " << ccap_string(new_caps)
5176 << " was " << ccap_string(cap->issued)
5177 << (was_stale ? "" : " (stale)") << dendl;
5178
5179 if (was_stale)
5180 cap->issued = cap->implemented = CEPH_CAP_PIN;
5181 cap->seq = m->get_seq();
5182 cap->gen = session->cap_gen;
5183
5184 check_cap_issue(in, new_caps);
5185
5186 // update inode
5187 int issued;
5188 in->caps_issued(&issued);
5189 issued |= in->caps_dirty();
5190
5191 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5192 !(issued & CEPH_CAP_AUTH_EXCL)) {
5193 in->mode = m->head.mode;
5194 in->uid = m->head.uid;
5195 in->gid = m->head.gid;
5196 in->btime = m->btime;
5197 }
5198 bool deleted_inode = false;
5199 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5200 !(issued & CEPH_CAP_LINK_EXCL)) {
5201 in->nlink = m->head.nlink;
5202 if (in->nlink == 0 &&
5203 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5204 deleted_inode = true;
5205 }
5206 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5207 m->xattrbl.length() &&
5208 m->head.xattr_version > in->xattr_version) {
5209 auto p = m->xattrbl.cbegin();
5210 decode(in->xattrs, p);
5211 in->xattr_version = m->head.xattr_version;
5212 }
5213
5214 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5215 in->dirstat.nfiles = m->get_nfiles();
5216 in->dirstat.nsubdirs = m->get_nsubdirs();
5217 }
5218
5219 if (new_caps & CEPH_CAP_ANY_RD) {
5220 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5221 m->get_ctime(), m->get_mtime(), m->get_atime());
5222 }
5223
5224 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5225 in->layout = m->get_layout();
5226 update_inode_file_size(in, issued, m->get_size(),
5227 m->get_truncate_seq(), m->get_truncate_size());
5228 }
5229
5230 if (m->inline_version > in->inline_version) {
5231 in->inline_data = m->inline_data;
5232 in->inline_version = m->inline_version;
5233 }
5234
5235 /* always take a newer change attr */
5236 if (m->get_change_attr() > in->change_attr)
5237 in->change_attr = m->get_change_attr();
5238
5239 // max_size
5240 if (cap == in->auth_cap &&
5241 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5242 (m->get_max_size() != in->max_size)) {
5243 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5244 in->max_size = m->get_max_size();
5245 if (in->max_size > in->wanted_max_size) {
5246 in->wanted_max_size = 0;
5247 in->requested_max_size = 0;
5248 }
5249 }
5250
5251 bool check = false;
5252 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5253 (wanted & ~(cap->wanted | new_caps))) {
5254 // If mds is importing cap, prior cap messages that update 'wanted'
5255 // may get dropped by mds (migrate seq mismatch).
5256 //
5257 // We don't send cap message to update 'wanted' if what we want are
5258 // already issued. If mds revokes caps, cap message that releases caps
5259 // also tells mds what we want. But if caps got revoked by mds forcedly
5260 // (session stale). We may haven't told mds what we want.
5261 check = true;
5262 }
5263
5264
5265 // update caps
5266 auto revoked = cap->issued & ~new_caps;
5267 if (revoked) {
5268 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5269 cap->issued = new_caps;
5270 cap->implemented |= new_caps;
5271
5272 // recall delegations if we're losing caps necessary for them
5273 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5274 in->recall_deleg(false);
5275 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5276 in->recall_deleg(true);
5277
5278 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5279 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5280 !_flush(in, new C_Client_FlushComplete(this, in))) {
5281 // waitin' for flush
5282 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5283 if (_release(in))
5284 check = true;
5285 } else {
5286 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5287 check = true;
5288 }
5289 } else if (cap->issued == new_caps) {
5290 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5291 } else {
5292 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5293 cap->issued = new_caps;
5294 cap->implemented |= new_caps;
5295
5296 if (cap == in->auth_cap) {
5297 // non-auth MDS is revoking the newly grant caps ?
5298 for (const auto &p : in->caps) {
5299 if (&p.second == cap)
5300 continue;
5301 if (p.second.implemented & ~p.second.issued & new_caps) {
5302 check = true;
5303 break;
5304 }
5305 }
5306 }
5307 }
5308
5309 if (check)
5310 check_caps(in, 0);
5311
5312 // wake up waiters
5313 if (new_caps)
5314 signal_cond_list(in->waitfor_caps);
5315
5316 // may drop inode's last ref
5317 if (deleted_inode)
5318 _try_to_trim_inode(in, true);
5319 }
5320
5321 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5322 {
5323 if (perms.uid() == 0)
5324 return 0;
5325
5326 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5327 int ret = _posix_acl_permission(in, perms, want);
5328 if (ret != -EAGAIN)
5329 return ret;
5330 }
5331
5332 // check permissions before doing anything else
5333 if (!in->check_mode(perms, want))
5334 return -EACCES;
5335 return 0;
5336 }
5337
5338 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5339 const UserPerm& perms)
5340 {
5341 int r = _getattr_for_perm(in, perms);
5342 if (r < 0)
5343 goto out;
5344
5345 r = 0;
5346 if (strncmp(name, "system.", 7) == 0) {
5347 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5348 r = -EPERM;
5349 } else {
5350 r = inode_permission(in, perms, want);
5351 }
5352 out:
5353 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5354 return r;
5355 }
5356
5357 ostream& operator<<(ostream &out, const UserPerm& perm) {
5358 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5359 return out;
5360 }
5361
// Permission check for a setattr with the given CEPH_SETATTR_* mask.
// Mirrors POSIX chown/chmod/utimes rules: size changes need write
// permission; uid/gid/mode changes are owner-or-root only; timestamp
// changes to explicit values are owner-or-root, while "set to now" only
// needs write permission.  May clear S_ISGID in stx->stx_mode as a side
// effect (non-root chmod by a user outside the file's group).
// Returns 0 or a negative errno.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncate/extend requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // from here on, failure paths fall out with -EPERM
  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner (a no-op "chown to self" is allowed)
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // owner may change group only to one of their own groups (or keep it)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       	 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-root chmod by someone outside the (target) group drops setgid
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // explicit timestamp values are owner/root-only; *_NOW variants
      // (utimes(NULL)-style) only require write permission
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5418
5419 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5420 {
5421 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5422 unsigned want = 0;
5423
5424 if ((flags & O_ACCMODE) == O_WRONLY)
5425 want = MAY_WRITE;
5426 else if ((flags & O_ACCMODE) == O_RDWR)
5427 want = MAY_READ | MAY_WRITE;
5428 else if ((flags & O_ACCMODE) == O_RDONLY)
5429 want = MAY_READ;
5430 if (flags & O_TRUNC)
5431 want |= MAY_WRITE;
5432
5433 int r = 0;
5434 switch (in->mode & S_IFMT) {
5435 case S_IFLNK:
5436 r = -ELOOP;
5437 goto out;
5438 case S_IFDIR:
5439 if (want & MAY_WRITE) {
5440 r = -EISDIR;
5441 goto out;
5442 }
5443 break;
5444 }
5445
5446 r = _getattr_for_perm(in, perms);
5447 if (r < 0)
5448 goto out;
5449
5450 r = inode_permission(in, perms, want);
5451 out:
5452 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5453 return r;
5454 }
5455
5456 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5457 {
5458 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5459 int r = _getattr_for_perm(dir, perms);
5460 if (r < 0)
5461 goto out;
5462
5463 r = inode_permission(dir, perms, MAY_EXEC);
5464 out:
5465 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5466 return r;
5467 }
5468
5469 int Client::may_create(Inode *dir, const UserPerm& perms)
5470 {
5471 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5472 int r = _getattr_for_perm(dir, perms);
5473 if (r < 0)
5474 goto out;
5475
5476 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5477 out:
5478 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5479 return r;
5480 }
5481
5482 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5483 {
5484 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5485 int r = _getattr_for_perm(dir, perms);
5486 if (r < 0)
5487 goto out;
5488
5489 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5490 if (r < 0)
5491 goto out;
5492
5493 /* 'name == NULL' means rmsnap */
5494 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5495 InodeRef otherin;
5496 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5497 if (r < 0)
5498 goto out;
5499 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5500 r = -EPERM;
5501 }
5502 out:
5503 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5504 return r;
5505 }
5506
5507 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5508 {
5509 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5510 int r = _getattr_for_perm(in, perms);
5511 if (r < 0)
5512 goto out;
5513
5514 if (perms.uid() == 0 || perms.uid() == in->uid) {
5515 r = 0;
5516 goto out;
5517 }
5518
5519 r = -EPERM;
5520 if (!S_ISREG(in->mode))
5521 goto out;
5522
5523 if (in->mode & S_ISUID)
5524 goto out;
5525
5526 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5527 goto out;
5528
5529 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5530 out:
5531 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5532 return r;
5533 }
5534
5535 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5536 {
5537 int mask = CEPH_STAT_CAP_MODE;
5538 bool force = false;
5539 if (acl_type != NO_ACL) {
5540 mask |= CEPH_STAT_CAP_XATTR;
5541 force = in->xattr_version == 0;
5542 }
5543 return _getattr(in, mask, perms, force);
5544 }
5545
5546 vinodeno_t Client::_get_vino(Inode *in)
5547 {
5548 /* The caller must hold the client lock */
5549 return vinodeno_t(in->ino, in->snapid);
5550 }
5551
5552 /**
5553 * Resolve an MDS spec to a list of MDS daemon GIDs.
5554 *
5555 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5556 * It may be '*' in which case it matches all GIDs.
5557 *
5558 * If no error is returned, the `targets` vector will be populated with at least
5559 * one MDS.
5560 */
5561 int Client::resolve_mds(
5562 const std::string &mds_spec,
5563 std::vector<mds_gid_t> *targets)
5564 {
5565 ceph_assert(fsmap);
5566 ceph_assert(targets != nullptr);
5567
5568 mds_role_t role;
5569 std::stringstream ss;
5570 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5571 if (role_r == 0) {
5572 // We got a role, resolve it to a GID
5573 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5574 << role << "'" << dendl;
5575 targets->push_back(
5576 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5577 return 0;
5578 }
5579
5580 std::string strtol_err;
5581 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5582 if (strtol_err.empty()) {
5583 // It is a possible GID
5584 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5585 if (fsmap->gid_exists(mds_gid)) {
5586 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5587 targets->push_back(mds_gid);
5588 } else {
5589 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5590 << dendl;
5591 return -ENOENT;
5592 }
5593 } else if (mds_spec == "*") {
5594 // It is a wildcard: use all MDSs
5595 const auto mds_info = fsmap->get_mds_info();
5596
5597 if (mds_info.empty()) {
5598 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5599 return -ENOENT;
5600 }
5601
5602 for (const auto i : mds_info) {
5603 targets->push_back(i.first);
5604 }
5605 } else {
5606 // It did not parse as an integer, it is not a wildcard, it must be a name
5607 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5608 if (mds_gid == 0) {
5609 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5610
5611 lderr(cct) << "FSMap: " << *fsmap << dendl;
5612
5613 return -ENOENT;
5614 } else {
5615 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5616 << "' to GID " << mds_gid << dendl;
5617 targets->push_back(mds_gid);
5618 }
5619 }
5620
5621 return 0;
5622 }
5623
5624
5625 /**
5626 * Authenticate with mon and establish global ID
5627 */
int Client::authenticate()
{
  ceph_assert(client_lock.is_locked_by_me());

  // Fast path: nothing to do if the monclient already authenticated.
  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks on the network; drop client_lock
  // around the call so other client work can proceed, then retake it.
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  // Adopt the monitor-assigned global id as our entity name.
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5648
// Fetch the latest FSMap (or FSMapUser when 'user' is true) from the
// monitors.  First learns the latest map version, then subscribes
// one-shot and waits until the local copy catches up.  Returns 0 or a
// negative errno.  Drops and retakes client_lock while waiting.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop the lock while the version query is in flight
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);  // monclient asks us to retry

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // Subscribe only if our cached fsmap.user is missing or behind.
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // Same dance for the full fsmap.
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5692
5693 /**
5694 *
5695 * @mds_spec one of ID, rank, GID, "*"
5696 *
5697 */
5698 int Client::mds_command(
5699 const std::string &mds_spec,
5700 const vector<string>& cmd,
5701 const bufferlist& inbl,
5702 bufferlist *outbl,
5703 string *outs,
5704 Context *onfinish)
5705 {
5706 std::lock_guard lock(client_lock);
5707
5708 if (!initialized)
5709 return -ENOTCONN;
5710
5711 int r;
5712 r = authenticate();
5713 if (r < 0) {
5714 return r;
5715 }
5716
5717 r = fetch_fsmap(false);
5718 if (r < 0) {
5719 return r;
5720 }
5721
5722 // Look up MDS target(s) of the command
5723 std::vector<mds_gid_t> targets;
5724 r = resolve_mds(mds_spec, &targets);
5725 if (r < 0) {
5726 return r;
5727 }
5728
5729 // If daemons are laggy, we won't send them commands. If all
5730 // are laggy then we fail.
5731 std::vector<mds_gid_t> non_laggy;
5732 for (const auto gid : targets) {
5733 const auto info = fsmap->get_info_gid(gid);
5734 if (!info.laggy()) {
5735 non_laggy.push_back(gid);
5736 }
5737 }
5738 if (non_laggy.size() == 0) {
5739 *outs = "All targeted MDS daemons are laggy";
5740 return -ENOENT;
5741 }
5742
5743 if (metadata.empty()) {
5744 // We are called on an unmounted client, so metadata
5745 // won't be initialized yet.
5746 populate_metadata("");
5747 }
5748
5749 // Send commands to targets
5750 C_GatherBuilder gather(cct, onfinish);
5751 for (const auto target_gid : non_laggy) {
5752 const auto info = fsmap->get_info_gid(target_gid);
5753
5754 // Open a connection to the target MDS
5755 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
5756
5757 // Generate MDSCommandOp state
5758 auto &op = command_table.start_command();
5759
5760 op.on_finish = gather.new_sub();
5761 op.cmd = cmd;
5762 op.outbl = outbl;
5763 op.outs = outs;
5764 op.inbl = inbl;
5765 op.mds_gid = target_gid;
5766 op.con = conn;
5767
5768 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5769 << " tid=" << op.tid << cmd << dendl;
5770
5771 // Construct and send MCommand
5772 auto m = op.get_message(monclient->get_fsid());
5773 conn->send_message2(std::move(m));
5774 }
5775 gather.activate();
5776
5777 return 0;
5778 }
5779
5780 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
5781 {
5782 ceph_tid_t const tid = m->get_tid();
5783
5784 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5785
5786 if (!command_table.exists(tid)) {
5787 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5788 return;
5789 }
5790
5791 auto &op = command_table.get_command(tid);
5792 if (op.outbl) {
5793 *op.outbl = m->get_data();
5794 }
5795 if (op.outs) {
5796 *op.outs = m->rs;
5797 }
5798
5799 if (op.on_finish) {
5800 op.on_finish->complete(m->r);
5801 }
5802
5803 command_table.erase(tid);
5804 }
5805
5806 // -------------------
5807 // MOUNT
5808
5809 int Client::subscribe_mdsmap(const std::string &fs_name)
5810 {
5811 int r = authenticate();
5812 if (r < 0) {
5813 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5814 return r;
5815 }
5816
5817 std::string resolved_fs_name;
5818 if (fs_name.empty()) {
5819 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
5820 } else {
5821 resolved_fs_name = fs_name;
5822 }
5823
5824 std::string want = "mdsmap";
5825 if (!resolved_fs_name.empty()) {
5826 r = fetch_fsmap(true);
5827 if (r < 0)
5828 return r;
5829 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5830 if (fscid == FS_CLUSTER_ID_NONE) {
5831 return -ENOENT;
5832 }
5833
5834 std::ostringstream oss;
5835 oss << want << "." << fscid;
5836 want = oss.str();
5837 }
5838 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5839
5840 monclient->sub_want(want, 0, 0);
5841 monclient->renew_subs();
5842
5843 return 0;
5844 }
5845
// Mount the filesystem: subscribe to the MDSMap, optionally wait for the
// MDS cluster to become available, then walk from the mount root up to
// the filesystem root issuing GETATTRs (so quota ancestors are cached),
// pin the root inode, and optionally open the client trace file.
// Returns 0, a negative errno, or CEPH_FUSE_NO_MDS_UP.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    // Wait (possibly across several map epochs) until the MDS cluster is
    // usable, failing fast if it is stuck unavailable.
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // GETATTR the mount point, then each ancestor in turn up to "/", so the
  // whole path (needed for quota checks) ends up in our cache.
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      // EACCES on an ancestor is tolerable once the mount point itself
      // resolved (root != NULL); quotas above it may just not work.
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);  // pin the root inode for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5939
5940 // UNMOUNT
5941
// Close every MDS session and block until all of them are gone.  Each
// pass requests closure of any session not already closing, then waits on
// mount_cond for the session set to shrink.
void Client::_close_sessions()
{
  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
      }
    }

    // wait for sessions to close
    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
    mount_cond.Wait(client_lock);
  }
}
5957
5958 void Client::flush_mdlog_sync()
5959 {
5960 if (mds_requests.empty())
5961 return;
5962 for (auto &p : mds_sessions) {
5963 flush_mdlog(&p.second);
5964 }
5965 }
5966
5967 void Client::flush_mdlog(MetaSession *session)
5968 {
5969 // Only send this to Luminous or newer MDS daemons, older daemons
5970 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5971 const uint64_t features = session->con->get_features();
5972 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5973 auto m = MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5974 session->con->send_message2(std::move(m));
5975 }
5976 }
5977
5978
// Abort all in-flight MDS requests with 'err' and force-close every
// session without waiting for the MDSs to reply.  Used on blacklist /
// forced unmount paths.
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;  // advance first; the request may be erased below us
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      // wake the thread blocked in make_request()
      req->kick = true;
      req->caller_cond->Signal();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session);
  }
}
6006
// Tear down the mount.  With abort=false this is an orderly unmount:
// flush the MDS journal, wait for in-flight requests, flush dirty caps
// and buffered data, drain the cache, and close sessions cleanly.  With
// abort=true (or when blacklisted) outstanding work is cancelled/purged
// instead of flushed.  Caller holds client_lock.
void Client::_unmount(bool abort)
{
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // wait for outstanding MDS requests to drain (aborted ones included)
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // same for low-level (libcephfs ll_*) file handles
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // and unclosed directory handles
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  // wait for unsafe (unacked) object writes issued outside the cache
  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"  << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
	// data cannot be written back; just drop it
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // no MDS to flush to: discard dirty caps and drop their inode refs
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // wait for the remaining inodes to be released, dumping the cache every
  // 5s while stuck to aid debugging
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
            << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
            << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
6138
// Public orderly unmount: flushes dirty state and closes sessions cleanly.
void Client::unmount()
{
  std::lock_guard lock(client_lock);
  _unmount(false);
}
6144
// Forced teardown: aborts in-flight requests and sessions instead of
// flushing (see _unmount(true)).
void Client::abort_conn()
{
  std::lock_guard lock(client_lock);
  _unmount(true);
}
6150
6151 void Client::flush_cap_releases()
6152 {
6153 // send any cap releases
6154 for (auto &p : mds_sessions) {
6155 auto &session = p.second;
6156 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6157 p.first)) {
6158 if (cct->_conf->client_inject_release_failure) {
6159 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6160 } else {
6161 session.con->send_message2(std::move(session.release));
6162 }
6163 session.release.reset();
6164 }
6165 }
6166 }
6167
// Periodic housekeeping, re-armed via the timer each interval: time out
// pre-mount requests, renew caps and push cap releases, service the
// delayed-cap list, and trim the cache.
void Client::tick()
{
  // test hook: artificially delay this tick once, then clear the knob
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  // schedule the next tick before doing any work
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	ceph_assert(client_lock.is_locked_by_me());
	tick();
      }));
  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    // Mount still in progress: time out the oldest pending request if it
    // has waited longer than client_mount_timeout, and wake anyone
    // blocked on the mdsmap or on session opens.
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: check inodes whose hold period has expired (the list is
  // ordered, so stop at the first one still being held)
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6223
6224 void Client::renew_caps()
6225 {
6226 ldout(cct, 10) << "renew_caps()" << dendl;
6227 last_cap_renew = ceph_clock_now();
6228
6229 for (auto &p : mds_sessions) {
6230 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6231 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6232 renew_caps(&p.second);
6233 }
6234 }
6235
6236 void Client::renew_caps(MetaSession *session)
6237 {
6238 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6239 session->last_cap_renew_request = ceph_clock_now();
6240 uint64_t seq = ++session->cap_renew_seq;
6241 session->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6242 }
6243
6244
6245 // ===============================================================
6246 // high level (POSIXy) interface
6247
6248 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6249 InodeRef *target, const UserPerm& perms)
6250 {
6251 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6252 MetaRequest *req = new MetaRequest(op);
6253 filepath path;
6254 dir->make_nosnap_relative_path(path);
6255 path.push_dentry(name);
6256 req->set_filepath(path);
6257 req->set_inode(dir);
6258 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6259 mask |= DEBUG_GETATTR_CAPS;
6260 req->head.args.getattr.mask = mask;
6261
6262 ldout(cct, 10) << __func__ << " on " << path << dendl;
6263
6264 int r = make_request(req, perms, target);
6265 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6266 return r;
6267 }
6268
// Look up `dname` in `dir`, preferring cached dentries when their leases
// (or the parent directory's FILE_SHARED cap) are still valid; otherwise
// fall back to an MDS round trip via _do_lookup(). `mask` is the cap set
// we need issued on the target for the cached answer to be usable.
// Returns 0 and sets *target on success, or a negative errno.
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // We don't know our parent dentry; ask an arbitrary in MDS for it.
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// Lookup failed; fall back to the directory itself.
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // The configured snapdir name (e.g. ".snap") is synthesized locally.
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    // The cached dentry is only usable if its inode (when present) has the
    // caps the caller asked for.
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	// The lease is valid only while the session's caps are live and
	// the session generation matches the one the lease was issued in.
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?  (FILE_SHARED on the directory covers its dentries as
      // long as the dentry's shared_gen matches the directory's)
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	// A null dentry in a complete directory proves the name is absent.
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?  (complete directory listing plus a
    // valid FILE_SHARED cap means the dentry map is authoritative)
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // Cache miss (or unusable cache): go to the MDS.
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    // Valid cached negative dentry.
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6385
// Return (via *pdn) the dentry for `name` in `dir`, creating a null dentry
// if none is cached. If `expect_null` is set and a cached dentry with a
// valid lease proves the name exists, fail with -EEXIST.
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?  A trustworthy positive dentry requires a live
    // lease from an MDS whose session caps and generation still match.
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession &s = mds_sessions.at(dn->lease_mds);
      if (s.cap_ttl > now &&
	  s.cap_gen == dn->lease_gen) {
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new (null) dentry
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6417
// Walk `origpath` component by component from root (absolute) or cwd
// (relative), resolving symlinks as encountered. Trailing symlinks are
// followed only when `followsym` is true; symlinks in directory position
// are always followed. `mask` is the extra cap mask requested on the final
// component. On success, the final inode is returned via *end (if non-null).
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  // Count of symlinks traversed so far, to bound resolution (ELOOP).
  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // Need execute permission on each intermediate directory.
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym. always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// Absolute target: restart the walk at root; otherwise continue
	// from the current directory.
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to. remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6499
6500
6501 // namespace ops
6502
6503 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6504 {
6505 std::lock_guard lock(client_lock);
6506 tout(cct) << "link" << std::endl;
6507 tout(cct) << relexisting << std::endl;
6508 tout(cct) << relpath << std::endl;
6509
6510 if (unmounting)
6511 return -ENOTCONN;
6512
6513 filepath existing(relexisting);
6514
6515 InodeRef in, dir;
6516 int r = path_walk(existing, &in, perm, true);
6517 if (r < 0)
6518 return r;
6519 if (std::string(relpath) == "/") {
6520 r = -EEXIST;
6521 return r;
6522 }
6523 filepath path(relpath);
6524 string name = path.last_dentry();
6525 path.pop_dentry();
6526
6527 r = path_walk(path, &dir, perm, true);
6528 if (r < 0)
6529 return r;
6530 if (cct->_conf->client_permissions) {
6531 if (S_ISDIR(in->mode)) {
6532 r = -EPERM;
6533 return r;
6534 }
6535 r = may_hardlink(in.get(), perm);
6536 if (r < 0)
6537 return r;
6538 r = may_create(dir.get(), perm);
6539 if (r < 0)
6540 return r;
6541 }
6542 r = _link(in.get(), dir.get(), name.c_str(), perm);
6543 return r;
6544 }
6545
6546 int Client::unlink(const char *relpath, const UserPerm& perm)
6547 {
6548 std::lock_guard lock(client_lock);
6549 tout(cct) << __func__ << std::endl;
6550 tout(cct) << relpath << std::endl;
6551
6552 if (unmounting)
6553 return -ENOTCONN;
6554
6555 if (std::string(relpath) == "/")
6556 return -EISDIR;
6557
6558 filepath path(relpath);
6559 string name = path.last_dentry();
6560 path.pop_dentry();
6561 InodeRef dir;
6562 int r = path_walk(path, &dir, perm);
6563 if (r < 0)
6564 return r;
6565 if (cct->_conf->client_permissions) {
6566 r = may_delete(dir.get(), name.c_str(), perm);
6567 if (r < 0)
6568 return r;
6569 }
6570 return _unlink(dir.get(), name.c_str(), perm);
6571 }
6572
6573 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6574 {
6575 std::lock_guard lock(client_lock);
6576 tout(cct) << __func__ << std::endl;
6577 tout(cct) << relfrom << std::endl;
6578 tout(cct) << relto << std::endl;
6579
6580 if (unmounting)
6581 return -ENOTCONN;
6582
6583 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6584 return -EBUSY;
6585
6586 filepath from(relfrom);
6587 filepath to(relto);
6588 string fromname = from.last_dentry();
6589 from.pop_dentry();
6590 string toname = to.last_dentry();
6591 to.pop_dentry();
6592
6593 InodeRef fromdir, todir;
6594 int r = path_walk(from, &fromdir, perm);
6595 if (r < 0)
6596 goto out;
6597 r = path_walk(to, &todir, perm);
6598 if (r < 0)
6599 goto out;
6600
6601 if (cct->_conf->client_permissions) {
6602 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6603 if (r < 0)
6604 return r;
6605 r = may_delete(todir.get(), toname.c_str(), perm);
6606 if (r < 0 && r != -ENOENT)
6607 return r;
6608 }
6609 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6610 out:
6611 return r;
6612 }
6613
6614 // dirs
6615
6616 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6617 {
6618 std::lock_guard lock(client_lock);
6619 tout(cct) << __func__ << std::endl;
6620 tout(cct) << relpath << std::endl;
6621 tout(cct) << mode << std::endl;
6622 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
6623
6624 if (unmounting)
6625 return -ENOTCONN;
6626
6627 if (std::string(relpath) == "/")
6628 return -EEXIST;
6629
6630 filepath path(relpath);
6631 string name = path.last_dentry();
6632 path.pop_dentry();
6633 InodeRef dir;
6634 int r = path_walk(path, &dir, perm);
6635 if (r < 0)
6636 return r;
6637 if (cct->_conf->client_permissions) {
6638 r = may_create(dir.get(), perm);
6639 if (r < 0)
6640 return r;
6641 }
6642 return _mkdir(dir.get(), name.c_str(), mode, perm);
6643 }
6644
// mkdir -p: walk as far as the path already exists, then create each
// remaining component in turn. Always starts from cwd (relative paths);
// returns 0 on success or a negative errno.
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // Only a missing component is recoverable; any other error (including
  // a permission failure above) is returned as-is.
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // EEXIST on an intermediate component is fine (e.g. created by a
    // concurrent mkdir); re-lookup it so we can keep descending.
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6699
6700 int Client::rmdir(const char *relpath, const UserPerm& perms)
6701 {
6702 std::lock_guard lock(client_lock);
6703 tout(cct) << __func__ << std::endl;
6704 tout(cct) << relpath << std::endl;
6705
6706 if (unmounting)
6707 return -ENOTCONN;
6708
6709 if (std::string(relpath) == "/")
6710 return -EBUSY;
6711
6712 filepath path(relpath);
6713 string name = path.last_dentry();
6714 path.pop_dentry();
6715 InodeRef dir;
6716 int r = path_walk(path, &dir, perms);
6717 if (r < 0)
6718 return r;
6719 if (cct->_conf->client_permissions) {
6720 int r = may_delete(dir.get(), name.c_str(), perms);
6721 if (r < 0)
6722 return r;
6723 }
6724 return _rmdir(dir.get(), name.c_str(), perms);
6725 }
6726
6727 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6728 {
6729 std::lock_guard lock(client_lock);
6730 tout(cct) << __func__ << std::endl;
6731 tout(cct) << relpath << std::endl;
6732 tout(cct) << mode << std::endl;
6733 tout(cct) << rdev << std::endl;
6734
6735 if (unmounting)
6736 return -ENOTCONN;
6737
6738 if (std::string(relpath) == "/")
6739 return -EEXIST;
6740
6741 filepath path(relpath);
6742 string name = path.last_dentry();
6743 path.pop_dentry();
6744 InodeRef dir;
6745 int r = path_walk(path, &dir, perms);
6746 if (r < 0)
6747 return r;
6748 if (cct->_conf->client_permissions) {
6749 int r = may_create(dir.get(), perms);
6750 if (r < 0)
6751 return r;
6752 }
6753 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6754 }
6755
6756 // symlinks
6757
6758 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6759 {
6760 std::lock_guard lock(client_lock);
6761 tout(cct) << __func__ << std::endl;
6762 tout(cct) << target << std::endl;
6763 tout(cct) << relpath << std::endl;
6764
6765 if (unmounting)
6766 return -ENOTCONN;
6767
6768 if (std::string(relpath) == "/")
6769 return -EEXIST;
6770
6771 filepath path(relpath);
6772 string name = path.last_dentry();
6773 path.pop_dentry();
6774 InodeRef dir;
6775 int r = path_walk(path, &dir, perms);
6776 if (r < 0)
6777 return r;
6778 if (cct->_conf->client_permissions) {
6779 int r = may_create(dir.get(), perms);
6780 if (r < 0)
6781 return r;
6782 }
6783 return _symlink(dir.get(), name.c_str(), target, perms);
6784 }
6785
6786 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6787 {
6788 std::lock_guard lock(client_lock);
6789 tout(cct) << __func__ << std::endl;
6790 tout(cct) << relpath << std::endl;
6791
6792 if (unmounting)
6793 return -ENOTCONN;
6794
6795 filepath path(relpath);
6796 InodeRef in;
6797 int r = path_walk(path, &in, perms, false);
6798 if (r < 0)
6799 return r;
6800
6801 return _readlink(in.get(), buf, size);
6802 }
6803
6804 int Client::_readlink(Inode *in, char *buf, size_t size)
6805 {
6806 if (!in->is_symlink())
6807 return -EINVAL;
6808
6809 // copy into buf (at most size bytes)
6810 int r = in->symlink.length();
6811 if (r > (int)size)
6812 r = size;
6813 memcpy(buf, in->symlink.c_str(), r);
6814 return r;
6815 }
6816
6817
6818 // inode stuff
6819
6820 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6821 {
6822 bool yes = in->caps_issued_mask(mask, true);
6823
6824 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
6825 if (yes && !force)
6826 return 0;
6827
6828 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6829 filepath path;
6830 in->make_nosnap_relative_path(path);
6831 req->set_filepath(path);
6832 req->set_inode(in);
6833 req->head.args.getattr.mask = mask;
6834
6835 int res = make_request(req, perms);
6836 ldout(cct, 10) << __func__ << " result=" << res << dendl;
6837 return res;
6838 }
6839
// Apply the setattr described by (stx, mask) to `in`. Changes covered by
// exclusive caps we hold are applied locally (marking caps dirty); anything
// left over in `mask` is sent to the MDS as a SETATTR request. The order of
// the cap checks below matters: each handled bit is cleared from `mask`, and
// only a non-empty residual mask triggers the MDS round trip.
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // Growing the file must respect quota.
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // Dirty whichever exclusive cap we hold; if we hold none, fall back
    // to asking the MDS to bump ctime for us.
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  // With AUTH_EXCL we can apply ownership/mode/btime changes locally.
  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // Only the permission bits change; the file-type bits are preserved.
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  // With FILE_EXCL we can apply mtime/atime changes locally.
  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
	in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
	in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // time_warp_seq tells other clients their cached times are stale.
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  // Everything was handled locally; no MDS request needed.
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  // For each attribute going to the MDS, also drop the caps whose cached
  // state the change will invalidate.
  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    // Reject sizes beyond the cluster-configured maximum file size.
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
7030
/* Note that we only care about attrs that setattr cares about */
// Translate a struct stat into the ceph_statx fields used by the setattr
// path (size, mode, uid, gid, mtime, atime). Other statx fields are left
// untouched.
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  // macOS names the timespec members st_*timespec rather than st_*tim.
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
7046
7047 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7048 const UserPerm& perms, InodeRef *inp)
7049 {
7050 int ret = _do_setattr(in, stx, mask, perms, inp);
7051 if (ret < 0)
7052 return ret;
7053 if (mask & CEPH_SETATTR_MODE)
7054 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7055 return ret;
7056 }
7057
7058 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7059 const UserPerm& perms)
7060 {
7061 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7062 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7063 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7064 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7065 if (cct->_conf->client_permissions) {
7066 int r = may_setattr(in.get(), stx, mask, perms);
7067 if (r < 0)
7068 return r;
7069 }
7070 return __setattrx(in.get(), stx, mask, perms);
7071 }
7072
7073 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7074 const UserPerm& perms)
7075 {
7076 struct ceph_statx stx;
7077
7078 stat_to_statx(attr, &stx);
7079 mask &= ~CEPH_SETATTR_BTIME;
7080
7081 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7082 mask &= ~CEPH_SETATTR_UID;
7083 }
7084 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7085 mask &= ~CEPH_SETATTR_GID;
7086 }
7087
7088 return _setattrx(in, &stx, mask, perms);
7089 }
7090
7091 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7092 const UserPerm& perms)
7093 {
7094 std::lock_guard lock(client_lock);
7095 tout(cct) << __func__ << std::endl;
7096 tout(cct) << relpath << std::endl;
7097 tout(cct) << mask << std::endl;
7098
7099 if (unmounting)
7100 return -ENOTCONN;
7101
7102 filepath path(relpath);
7103 InodeRef in;
7104 int r = path_walk(path, &in, perms);
7105 if (r < 0)
7106 return r;
7107 return _setattr(in, attr, mask, perms);
7108 }
7109
7110 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7111 const UserPerm& perms, int flags)
7112 {
7113 std::lock_guard lock(client_lock);
7114 tout(cct) << __func__ << std::endl;
7115 tout(cct) << relpath << std::endl;
7116 tout(cct) << mask << std::endl;
7117
7118 if (unmounting)
7119 return -ENOTCONN;
7120
7121 filepath path(relpath);
7122 InodeRef in;
7123 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7124 if (r < 0)
7125 return r;
7126 return _setattrx(in, stx, mask, perms);
7127 }
7128
7129 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7130 {
7131 std::lock_guard lock(client_lock);
7132 tout(cct) << __func__ << std::endl;
7133 tout(cct) << fd << std::endl;
7134 tout(cct) << mask << std::endl;
7135
7136 if (unmounting)
7137 return -ENOTCONN;
7138
7139 Fh *f = get_filehandle(fd);
7140 if (!f)
7141 return -EBADF;
7142 #if defined(__linux__) && defined(O_PATH)
7143 if (f->flags & O_PATH)
7144 return -EBADF;
7145 #endif
7146 return _setattr(f->inode, attr, mask, perms);
7147 }
7148
7149 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7150 {
7151 std::lock_guard lock(client_lock);
7152 tout(cct) << __func__ << std::endl;
7153 tout(cct) << fd << std::endl;
7154 tout(cct) << mask << std::endl;
7155
7156 if (unmounting)
7157 return -ENOTCONN;
7158
7159 Fh *f = get_filehandle(fd);
7160 if (!f)
7161 return -EBADF;
7162 #if defined(__linux__) && defined(O_PATH)
7163 if (f->flags & O_PATH)
7164 return -EBADF;
7165 #endif
7166 return _setattrx(f->inode, stx, mask, perms);
7167 }
7168
7169 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7170 frag_info_t *dirstat, int mask)
7171 {
7172 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7173 std::lock_guard lock(client_lock);
7174 tout(cct) << "stat" << std::endl;
7175 tout(cct) << relpath << std::endl;
7176
7177 if (unmounting)
7178 return -ENOTCONN;
7179
7180 filepath path(relpath);
7181 InodeRef in;
7182 int r = path_walk(path, &in, perms, true, mask);
7183 if (r < 0)
7184 return r;
7185 r = _getattr(in, mask, perms);
7186 if (r < 0) {
7187 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7188 return r;
7189 }
7190 fill_stat(in, stbuf, dirstat);
7191 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7192 return r;
7193 }
7194
7195 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7196 {
7197 unsigned mask = 0;
7198
7199 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7200 if (flags & AT_NO_ATTR_SYNC)
7201 goto out;
7202
7203 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7204 mask |= CEPH_CAP_PIN;
7205 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7206 mask |= CEPH_CAP_AUTH_SHARED;
7207 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7208 mask |= CEPH_CAP_LINK_SHARED;
7209 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7210 mask |= CEPH_CAP_FILE_SHARED;
7211 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7212 mask |= CEPH_CAP_XATTR_SHARED;
7213 out:
7214 return mask;
7215 }
7216
7217 int Client::statx(const char *relpath, struct ceph_statx *stx,
7218 const UserPerm& perms,
7219 unsigned int want, unsigned int flags)
7220 {
7221 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7222 std::lock_guard lock(client_lock);
7223 tout(cct) << "statx" << std::endl;
7224 tout(cct) << relpath << std::endl;
7225
7226 if (unmounting)
7227 return -ENOTCONN;
7228
7229 filepath path(relpath);
7230 InodeRef in;
7231
7232 unsigned mask = statx_to_mask(flags, want);
7233
7234 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7235 if (r < 0)
7236 return r;
7237
7238 r = _getattr(in, mask, perms);
7239 if (r < 0) {
7240 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7241 return r;
7242 }
7243
7244 fill_statx(in, mask, stx);
7245 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7246 return r;
7247 }
7248
7249 int Client::lstat(const char *relpath, struct stat *stbuf,
7250 const UserPerm& perms, frag_info_t *dirstat, int mask)
7251 {
7252 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7253 std::lock_guard lock(client_lock);
7254 tout(cct) << __func__ << std::endl;
7255 tout(cct) << relpath << std::endl;
7256
7257 if (unmounting)
7258 return -ENOTCONN;
7259
7260 filepath path(relpath);
7261 InodeRef in;
7262 // don't follow symlinks
7263 int r = path_walk(path, &in, perms, false, mask);
7264 if (r < 0)
7265 return r;
7266 r = _getattr(in, mask, perms);
7267 if (r < 0) {
7268 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7269 return r;
7270 }
7271 fill_stat(in, stbuf, dirstat);
7272 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7273 return r;
7274 }
7275
// Populate a struct stat from the cached attributes of `in`. Optionally
// also copies out the directory fragstat/rstat. Returns the caps currently
// issued on the inode (callers can use this to judge freshness).
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // The snapid doubles as the device number so snapshots of the same file
  // get distinct (dev, ino) pairs.
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // Directory nlink is synthesized: parent dentry + "." + one per
    // subdirectory ("<subdir>/.."). in->nlink for a dir is only ever 0
    // (unlinked) or 1.
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // Report whichever of ctime/mtime is later as ctime (local cap-driven
  // mtime updates can run ahead of ctime).
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // Directory size is either the recursive byte count or the entry
    // count, depending on configuration.
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    // 512-byte blocks, rounded up.
    st->st_blocks = (in->size + 511) >> 9;
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7337
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  // Populate a ceph_statx from cached inode state.  Only field groups whose
  // cap bits appear in @mask are filled; stx->stx_mask records which statx
  // fields were actually populated.
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  /* ownership/mode/birth-time are guarded by the AUTH_SHARED cap */
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  /* link count is guarded by the LINK_SHARED cap */
  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();  // a directory can only ever have nlink 0 or 1
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  /* timestamps and size are guarded by the FILE_SHARED cap */
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // directory size: recursive byte count or entry count, per config
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // report the later of the cached ctime and mtime as ctime
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7420
void Client::touch_dn(Dentry *dn)
{
  // Mark the dentry as recently used so the LRU trimmer keeps it longer.
  lru.lru_touch(dn);
}
7425
7426 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7427 {
7428 std::lock_guard lock(client_lock);
7429 tout(cct) << __func__ << std::endl;
7430 tout(cct) << relpath << std::endl;
7431 tout(cct) << mode << std::endl;
7432
7433 if (unmounting)
7434 return -ENOTCONN;
7435
7436 filepath path(relpath);
7437 InodeRef in;
7438 int r = path_walk(path, &in, perms);
7439 if (r < 0)
7440 return r;
7441 struct stat attr;
7442 attr.st_mode = mode;
7443 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7444 }
7445
7446 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7447 {
7448 std::lock_guard lock(client_lock);
7449 tout(cct) << __func__ << std::endl;
7450 tout(cct) << fd << std::endl;
7451 tout(cct) << mode << std::endl;
7452
7453 if (unmounting)
7454 return -ENOTCONN;
7455
7456 Fh *f = get_filehandle(fd);
7457 if (!f)
7458 return -EBADF;
7459 #if defined(__linux__) && defined(O_PATH)
7460 if (f->flags & O_PATH)
7461 return -EBADF;
7462 #endif
7463 struct stat attr;
7464 attr.st_mode = mode;
7465 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7466 }
7467
7468 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7469 {
7470 std::lock_guard lock(client_lock);
7471 tout(cct) << __func__ << std::endl;
7472 tout(cct) << relpath << std::endl;
7473 tout(cct) << mode << std::endl;
7474
7475 if (unmounting)
7476 return -ENOTCONN;
7477
7478 filepath path(relpath);
7479 InodeRef in;
7480 // don't follow symlinks
7481 int r = path_walk(path, &in, perms, false);
7482 if (r < 0)
7483 return r;
7484 struct stat attr;
7485 attr.st_mode = mode;
7486 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7487 }
7488
7489 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7490 const UserPerm& perms)
7491 {
7492 std::lock_guard lock(client_lock);
7493 tout(cct) << __func__ << std::endl;
7494 tout(cct) << relpath << std::endl;
7495 tout(cct) << new_uid << std::endl;
7496 tout(cct) << new_gid << std::endl;
7497
7498 if (unmounting)
7499 return -ENOTCONN;
7500
7501 filepath path(relpath);
7502 InodeRef in;
7503 int r = path_walk(path, &in, perms);
7504 if (r < 0)
7505 return r;
7506 struct stat attr;
7507 attr.st_uid = new_uid;
7508 attr.st_gid = new_gid;
7509 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7510 }
7511
7512 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7513 {
7514 std::lock_guard lock(client_lock);
7515 tout(cct) << __func__ << std::endl;
7516 tout(cct) << fd << std::endl;
7517 tout(cct) << new_uid << std::endl;
7518 tout(cct) << new_gid << std::endl;
7519
7520 if (unmounting)
7521 return -ENOTCONN;
7522
7523 Fh *f = get_filehandle(fd);
7524 if (!f)
7525 return -EBADF;
7526 #if defined(__linux__) && defined(O_PATH)
7527 if (f->flags & O_PATH)
7528 return -EBADF;
7529 #endif
7530 struct stat attr;
7531 attr.st_uid = new_uid;
7532 attr.st_gid = new_gid;
7533 int mask = 0;
7534 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7535 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7536 return _setattr(f->inode, &attr, mask, perms);
7537 }
7538
7539 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7540 const UserPerm& perms)
7541 {
7542 std::lock_guard lock(client_lock);
7543 tout(cct) << __func__ << std::endl;
7544 tout(cct) << relpath << std::endl;
7545 tout(cct) << new_uid << std::endl;
7546 tout(cct) << new_gid << std::endl;
7547
7548 if (unmounting)
7549 return -ENOTCONN;
7550
7551 filepath path(relpath);
7552 InodeRef in;
7553 // don't follow symlinks
7554 int r = path_walk(path, &in, perms, false);
7555 if (r < 0)
7556 return r;
7557 struct stat attr;
7558 attr.st_uid = new_uid;
7559 attr.st_gid = new_gid;
7560 int mask = 0;
7561 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7562 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7563 return _setattr(in, &attr, mask, perms);
7564 }
7565
7566 static void attr_set_atime_and_mtime(struct stat *attr,
7567 const utime_t &atime,
7568 const utime_t &mtime)
7569 {
7570 stat_set_atime_sec(attr, atime.tv.tv_sec);
7571 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
7572 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
7573 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
7574 }
7575
// for [l]utime() invoke the timeval variant as the timespec
// variants are not yet implemented. for futime[s](), invoke
// the timespec variant.
7579 int Client::utime(const char *relpath, struct utimbuf *buf,
7580 const UserPerm& perms)
7581 {
7582 struct timeval tv[2];
7583 tv[0].tv_sec = buf->actime;
7584 tv[0].tv_usec = 0;
7585 tv[1].tv_sec = buf->modtime;
7586 tv[1].tv_usec = 0;
7587
7588 return utimes(relpath, tv, perms);
7589 }
7590
7591 int Client::lutime(const char *relpath, struct utimbuf *buf,
7592 const UserPerm& perms)
7593 {
7594 struct timeval tv[2];
7595 tv[0].tv_sec = buf->actime;
7596 tv[0].tv_usec = 0;
7597 tv[1].tv_sec = buf->modtime;
7598 tv[1].tv_usec = 0;
7599
7600 return lutimes(relpath, tv, perms);
7601 }
7602
7603 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7604 {
7605 struct timespec ts[2];
7606 ts[0].tv_sec = buf->actime;
7607 ts[0].tv_nsec = 0;
7608 ts[1].tv_sec = buf->modtime;
7609 ts[1].tv_nsec = 0;
7610
7611 return futimens(fd, ts, perms);
7612 }
7613
7614 int Client::utimes(const char *relpath, struct timeval times[2],
7615 const UserPerm& perms)
7616 {
7617 std::lock_guard lock(client_lock);
7618 tout(cct) << __func__ << std::endl;
7619 tout(cct) << relpath << std::endl;
7620 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7621 << std::endl;
7622 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7623 << std::endl;
7624
7625 if (unmounting)
7626 return -ENOTCONN;
7627
7628 filepath path(relpath);
7629 InodeRef in;
7630 int r = path_walk(path, &in, perms);
7631 if (r < 0)
7632 return r;
7633 struct stat attr;
7634 utime_t atime(times[0]);
7635 utime_t mtime(times[1]);
7636
7637 attr_set_atime_and_mtime(&attr, atime, mtime);
7638 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7639 }
7640
7641 int Client::lutimes(const char *relpath, struct timeval times[2],
7642 const UserPerm& perms)
7643 {
7644 std::lock_guard lock(client_lock);
7645 tout(cct) << __func__ << std::endl;
7646 tout(cct) << relpath << std::endl;
7647 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7648 << std::endl;
7649 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7650 << std::endl;
7651
7652 if (unmounting)
7653 return -ENOTCONN;
7654
7655 filepath path(relpath);
7656 InodeRef in;
7657 int r = path_walk(path, &in, perms, false);
7658 if (r < 0)
7659 return r;
7660 struct stat attr;
7661 utime_t atime(times[0]);
7662 utime_t mtime(times[1]);
7663
7664 attr_set_atime_and_mtime(&attr, atime, mtime);
7665 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7666 }
7667
int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
{
  // Convert microsecond-resolution timevals into timespecs and delegate to
  // the nanosecond-based futimens().
  struct timespec ts[2];
  ts[0].tv_sec = times[0].tv_sec;
  ts[0].tv_nsec = times[0].tv_usec * 1000;
  ts[1].tv_sec = times[1].tv_sec;
  ts[1].tv_nsec = times[1].tv_usec * 1000;

  return futimens(fd, ts, perms);
}
7678
7679 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7680 {
7681 std::lock_guard lock(client_lock);
7682 tout(cct) << __func__ << std::endl;
7683 tout(cct) << fd << std::endl;
7684 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7685 << std::endl;
7686 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7687 << std::endl;
7688
7689 if (unmounting)
7690 return -ENOTCONN;
7691
7692 Fh *f = get_filehandle(fd);
7693 if (!f)
7694 return -EBADF;
7695 #if defined(__linux__) && defined(O_PATH)
7696 if (f->flags & O_PATH)
7697 return -EBADF;
7698 #endif
7699 struct stat attr;
7700 utime_t atime(times[0]);
7701 utime_t mtime(times[1]);
7702
7703 attr_set_atime_and_mtime(&attr, atime, mtime);
7704 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7705 }
7706
7707 int Client::flock(int fd, int operation, uint64_t owner)
7708 {
7709 std::lock_guard lock(client_lock);
7710 tout(cct) << __func__ << std::endl;
7711 tout(cct) << fd << std::endl;
7712 tout(cct) << operation << std::endl;
7713 tout(cct) << owner << std::endl;
7714
7715 if (unmounting)
7716 return -ENOTCONN;
7717
7718 Fh *f = get_filehandle(fd);
7719 if (!f)
7720 return -EBADF;
7721
7722 return _flock(f, operation, owner);
7723 }
7724
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  // Open the directory named by relpath, returning a handle in *dirpp.
  // *dirpp is only valid when 0 is returned.
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  // with client-side permission checks enabled, opening a directory needs
  // the same rights as open(O_RDONLY)
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
  if (r != -ENOTDIR)
      tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}
7750
7751 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7752 {
7753 if (!in->is_dir())
7754 return -ENOTDIR;
7755 *dirpp = new dir_result_t(in, perms);
7756 opened_dirs.insert(*dirpp);
7757 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7758 return 0;
7759 }
7760
7761
int Client::closedir(dir_result_t *dir)
{
  // Close a handle returned by opendir(); always succeeds.
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  // log before _closedir() frees the handle
  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7772
void Client::_closedir(dir_result_t *dirp)
{
  // Tear down an open directory handle: drop the inode reference, discard
  // buffered dirents, deregister, and free.  dirp is invalid afterwards.
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7784
void Client::rewinddir(dir_result_t *dirp)
{
  // Reset the handle to the beginning of the directory, discarding any
  // buffered dirents.
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7797
loff_t Client::telldir(dir_result_t *dirp)
{
  // Report the current readdir position.  Note: unlike seekdir(), no
  // client_lock is taken here; only the handle's offset field is read.
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7804
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  // Reposition a readdir handle to a position previously obtained from
  // telldir(), invalidating buffered/cache state as needed.
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash-ordered listing: only a backwards seek invalidates the buffer
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag-ordered listing: drop the buffer when seeking to the start,
    // into a different fragment, or backwards within the buffered fragment
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset))  {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7838
7839
7840 //struct dirent {
7841 // ino_t d_ino; /* inode number */
7842 // off_t d_off; /* offset to the next dirent */
7843 // unsigned short d_reclen; /* length of this record */
7844 // unsigned char d_type; /* type of file */
7845 // char d_name[256]; /* filename */
7846 //};
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // Fill a POSIX dirent: name (truncated to 255 chars plus NUL), inode
  // number, the offset of the *next* entry, and a DT_* type derived from
  // the mode's S_IF* bits.  d_ino/d_off/d_type are skipped on platforms
  // whose dirent lacks those fields.
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7862
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  // Advance the readdir position to the next directory fragment, or mark
  // the end of the directory if the current frag was the rightmost one.
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // restart at the beginning of the new frag, re-resolving it against
    // the dirfragtree in case it has since split or merged
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7888
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  // Re-resolve the current fragment against the (possibly updated)
  // dirfragtree; if the mapping changed, restart the position at the
  // beginning of the mapped fragment.  No-op for hash-ordered listings.
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7905
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  // Discard any dirents buffered from a previous READDIR reply.
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
7911
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  // Issue a READDIR (or LSSNAP for the snapdir) request to the MDS for the
  // fragment the handle currently points at; the reply path populates
  // dirp->buffer.  On -EAGAIN (stale frag) re-resolve the frag and retry.
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  // resume after the last returned name, or by hash position
  if (dirp->last_name.length()) {
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // the requested frag no longer exists (split/merge); pick the frag the
    // tree now maps this position to and try again
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7966
// Comparator for std::lower_bound over the readdir cache: orders cached
// dentries by their fragment-aware directory offset.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7972
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  // Serve readdir entries straight from the in-memory dentry cache,
  // invoking @cb once per entry.  Returns -EAGAIN if the cached listing
  // stops being complete-and-ordered mid-walk (caller falls back to the
  // MDS), a negative error, the callback's positive stop value, or 0 at
  // the end of the directory.
  ceph_assert(client_lock.is_locked());
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // start from the first cached dentry at or after the current offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // the cache can be invalidated while client_lock is dropped below
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    // refresh attributes for the requested caps before reporting them
    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop client_lock while running the user callback; the
    // is_complete_and_ordered check at the top of the loop revalidates
    // the cache after reacquiring it
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8056
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  // Core readdir driver: walks the directory from the handle's current
  // offset, invoking @cb for each entry (including the synthetic "." and
  // "..").  Entries come from the dentry cache when it is complete and
  // ordered, otherwise from MDS READDIR replies.  Returns a negative
  // error, the callback's positive stop value, or 0 at end of directory.
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0 is the synthetic "." entry (the directory itself)
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // client_lock is dropped around the user callback
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1 is the synthetic ".." entry (first parent, or self at root)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    // client_lock is dropped around the user callback
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: the cache was invalidated mid-walk; fall through to the
    // MDS-backed path below
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // deliver every buffered entry at or after the current offset
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      // client_lock is dropped around the user callback
      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // we walked the whole directory; if nothing changed underneath us,
    // mark the cached listing complete (and ordered when applicable)
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // not reached: the loop above always returns
  return 0;
}
8249
8250
8251 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8252 {
8253 return readdirplus_r(d, de, 0, 0, 0, NULL);
8254 }
8255
8256 /*
8257 * readdirplus_r
8258 *
8259 * returns
8260 * 1 if we got a dirent
8261 * 0 for end of directory
8262 * <0 on error
8263 */
8264
/* Accumulator for the single-entry readdir callback: holds the caller's
 * output buffers and a flag noting that one entry was already taken. */
struct single_readdir {
  struct dirent *de;       // destination dirent
  struct ceph_statx *stx;  // optional statx destination (may be NULL)
  Inode *inode;            // inode pointer handed back by the callback
  bool full;               // true once one entry has been consumed
};
8271
8272 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8273 struct ceph_statx *stx, off_t off,
8274 Inode *in)
8275 {
8276 single_readdir *c = static_cast<single_readdir *>(p);
8277
8278 if (c->full)
8279 return -1; // already filled this dirent
8280
8281 *c->de = *de;
8282 if (c->stx)
8283 *c->stx = *stx;
8284 c->inode = in;
8285 c->full = true;
8286 return 1;
8287 }
8288
struct dirent *Client::readdir(dir_result_t *d)
{
  // POSIX-style readdir(): returns the next entry or NULL at end/error.
  // NOTE: the result points into a static buffer, so concurrent callers
  // (even on different handles) will clobber each other's results.
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    // report errors the way POSIX readdir() does: errno + NULL
    errno = -ret; // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;
}
8311
8312 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8313 struct ceph_statx *stx, unsigned want,
8314 unsigned flags, Inode **out)
8315 {
8316 single_readdir sr;
8317 sr.de = de;
8318 sr.stx = stx;
8319 sr.inode = NULL;
8320 sr.full = false;
8321
8322 // our callback fills the dirent and sets sr.full=true on first
8323 // call, and returns -1 the second time around.
8324 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8325 if (r < -1)
8326 return r;
8327 if (out)
8328 *out = sr.inode;
8329 if (sr.full)
8330 return 1;
8331 return 0;
8332 }
8333
8334
8335 /* getdents */
/* getdents */
/* Accumulator for _getdents(): packs entries into a flat byte buffer. */
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: copy whole struct dirent; false: names only
};
8342
8343 static int _readdir_getdent_cb(void *p, struct dirent *de,
8344 struct ceph_statx *stx, off_t off, Inode *in)
8345 {
8346 struct getdents_result *c = static_cast<getdents_result *>(p);
8347
8348 int dlen;
8349 if (c->fullent)
8350 dlen = sizeof(*de);
8351 else
8352 dlen = strlen(de->d_name) + 1;
8353
8354 if (c->pos + dlen > c->buflen)
8355 return -1; // doesn't fit
8356
8357 if (c->fullent) {
8358 memcpy(c->buf + c->pos, de, sizeof(*de));
8359 } else {
8360 memcpy(c->buf + c->pos, de->d_name, dlen);
8361 }
8362 c->pos += dlen;
8363 return 0;
8364 }
8365
8366 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8367 {
8368 getdents_result gr;
8369 gr.buf = buf;
8370 gr.buflen = buflen;
8371 gr.fullent = fullent;
8372 gr.pos = 0;
8373
8374 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8375
8376 if (r < 0) { // some error
8377 if (r == -1) { // buffer ran out of space
8378 if (gr.pos) { // but we got some entries already!
8379 return gr.pos;
8380 } // or we need a larger buffer
8381 return -ERANGE;
8382 } else { // actual error, return it
8383 return r;
8384 }
8385 }
8386 return gr.pos;
8387 }
8388
8389
8390 /* getdir */
/* getdir */
/* Accumulator for getdir(): collects entry names and counts them. */
struct getdir_result {
  list<string> *contents;  // destination list of entry names
  int num;                 // number of entries collected
};
8395
8396 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8397 {
8398 getdir_result *r = static_cast<getdir_result *>(p);
8399
8400 r->contents->push_back(de->d_name);
8401 r->num++;
8402 return 0;
8403 }
8404
8405 int Client::getdir(const char *relpath, list<string>& contents,
8406 const UserPerm& perms)
8407 {
8408 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8409 {
8410 std::lock_guard lock(client_lock);
8411 tout(cct) << "getdir" << std::endl;
8412 tout(cct) << relpath << std::endl;
8413 }
8414
8415 dir_result_t *d;
8416 int r = opendir(relpath, &d, perms);
8417 if (r < 0)
8418 return r;
8419
8420 getdir_result gr;
8421 gr.contents = &contents;
8422 gr.num = 0;
8423 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8424
8425 closedir(d);
8426
8427 if (r < 0)
8428 return r;
8429 return gr.num;
8430 }
8431
8432
8433 /****** file i/o **********/
int Client::open(const char *relpath, int flags, const UserPerm& perms,
                 mode_t mode, int stripe_unit, int stripe_count,
                 int object_size, const char *data_pool)
{
  // Open (and possibly create) the file at `relpath`, returning a new
  // integer file descriptor or a negative errno. The stripe_* /
  // object_size / data_pool arguments select the file layout when a new
  // file is created (0 / NULL means "use defaults").
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // O_CREAT|O_EXCL demands the path did not already exist.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // Target missing and O_CREAT given: walk to the parent directory
    // and create the final component there.
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
                  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
        goto out;
    }
    // _create may also hand back an Fh (fh != NULL) for the new file.
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
                stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
        goto out;
    }
  }

  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8518
8519 int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8520 {
8521 /* Use default file striping parameters */
8522 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8523 }
8524
8525 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8526 const UserPerm& perms)
8527 {
8528 std::lock_guard lock(client_lock);
8529 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8530
8531 if (unmounting)
8532 return -ENOTCONN;
8533
8534 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8535 filepath path(ino);
8536 req->set_filepath(path);
8537
8538 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8539 char f[30];
8540 sprintf(f, "%u", h);
8541 filepath path2(dirino);
8542 path2.push_dentry(string(f));
8543 req->set_filepath2(path2);
8544
8545 int r = make_request(req, perms, NULL, NULL,
8546 rand() % mdsmap->get_num_in_mds());
8547 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8548 return r;
8549 }
8550
8551
/**
 * Load inode into local cache.
 *
 * If inode pointer is non-NULL, take a reference on
 * the resulting Inode object in one operation, so that caller
 * can safely assume inode will still be there after return.
 */
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // Any active MDS can service a lookup-by-ino; pick one at random.
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // The reply handling inserted the inode into inode_map; fetch it
    // and take an ll reference for the caller.
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
  return r;
}
8581
8582 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8583 {
8584 std::lock_guard lock(client_lock);
8585 return _lookup_ino(ino, perms, inode);
8586 }
8587
/**
 * Find the parent inode of `ino` and insert it into
 * our cache. Conditionally also set `parent` to a referenced
 * Inode* if caller provides non-NULL value.
 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
  // NOTE(review): unlike _lookup_ino/_lookup_name this path has no
  // `unmounting` -ENOTCONN guard — confirm whether that is intentional
  // (it may also run on internal teardown paths).

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      // Hand back a referenced parent so it survives past this call.
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8616
/**
 * Populate the parent dentry for `ino`, provided it is
 * a child of `parent`.
 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  // filepath2 = the parent dir, filepath = the child whose dentry we want.
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8638
8639 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8640 {
8641 std::lock_guard lock(client_lock);
8642 return _lookup_name(ino, parent, perms);
8643 }
8644
8645 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8646 {
8647 ceph_assert(in);
8648 Fh *f = new Fh(in, flags, cmode, perms);
8649
8650 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
8651
8652 if (in->snapid != CEPH_NOSNAP) {
8653 in->snap_cap_refs++;
8654 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8655 << ccap_string(in->caps_issued()) << dendl;
8656 }
8657
8658 const auto& conf = cct->_conf;
8659 f->readahead.set_trigger_requests(1);
8660 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8661 uint64_t max_readahead = Readahead::NO_LIMIT;
8662 if (conf->client_readahead_max_bytes) {
8663 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8664 }
8665 if (conf->client_readahead_max_periods) {
8666 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8667 }
8668 f->readahead.set_max_readahead_size(max_readahead);
8669 vector<uint64_t> alignments;
8670 alignments.push_back(in->layout.get_period());
8671 alignments.push_back(in->layout.stripe_unit);
8672 f->readahead.set_alignments(alignments);
8673
8674 return f;
8675 }
8676
int Client::_release_fh(Fh *f)
{
  // Tear down an open file handle: drop open / snap refs, flush dirty
  // data when the last writer-mode ref goes away, release file locks,
  // and surface any asynchronous flush error to the caller.
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // Last open ref in this mode: flush buffers and re-evaluate caps.
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // Snapshot handles only carry a snap cap ref (taken in _create_fh).
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8711
8712 void Client::_put_fh(Fh *f)
8713 {
8714 int left = f->put();
8715 if (!left) {
8716 delete f;
8717 }
8718 }
8719
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
                  const UserPerm& perms)
{
  // Core open path (client_lock held): take an open ref, ask the MDS to
  // open the inode unless we already hold the caps this open mode
  // wants, and hand back a fresh Fh on success.
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;  // snapshots are read-only
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // Already hold the caps this mode needs: no MDS round trip.
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    // Creation is handled by the _create() path, never here.
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
        need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
        need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
        ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
                          " . Denying open: " <<
                          cpp_strerror(result) << dendl;
        in->put_open_ref(cmode);
      } else {
        // We only needed to wait for the caps, not hold them.
        put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // Undo the pending-open accounting taken above.
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8800
int Client::_renew_caps(Inode *in)
{
  // Re-acquire file caps matching this inode's open modes. If we still
  // hold some caps (and either want no write caps or still know the
  // auth MDS), a cap update suffices; otherwise replay an MDS open.
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // Translate wanted caps back into open flags for the replayed open.
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8838
8839 int Client::close(int fd)
8840 {
8841 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8842 std::lock_guard lock(client_lock);
8843 tout(cct) << "close" << std::endl;
8844 tout(cct) << fd << std::endl;
8845
8846 if (unmounting)
8847 return -ENOTCONN;
8848
8849 Fh *fh = get_filehandle(fd);
8850 if (!fh)
8851 return -EBADF;
8852 int err = _release_fh(fh);
8853 fd_map.erase(fd);
8854 put_fd(fd);
8855 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8856 return err;
8857 }
8858
8859
8860 // ------------
8861 // read, write
8862
8863 loff_t Client::lseek(int fd, loff_t offset, int whence)
8864 {
8865 std::lock_guard lock(client_lock);
8866 tout(cct) << "lseek" << std::endl;
8867 tout(cct) << fd << std::endl;
8868 tout(cct) << offset << std::endl;
8869 tout(cct) << whence << std::endl;
8870
8871 if (unmounting)
8872 return -ENOTCONN;
8873
8874 Fh *f = get_filehandle(fd);
8875 if (!f)
8876 return -EBADF;
8877 #if defined(__linux__) && defined(O_PATH)
8878 if (f->flags & O_PATH)
8879 return -EBADF;
8880 #endif
8881 return _lseek(f, offset, whence);
8882 }
8883
8884 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8885 {
8886 Inode *in = f->inode.get();
8887 int r;
8888 loff_t pos = -1;
8889
8890 switch (whence) {
8891 case SEEK_SET:
8892 pos = offset;
8893 break;
8894
8895 case SEEK_CUR:
8896 pos += offset;
8897 break;
8898
8899 case SEEK_END:
8900 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8901 if (r < 0)
8902 return r;
8903 pos = in->size + offset;
8904 break;
8905
8906 default:
8907 ceph_abort();
8908 }
8909
8910 if (pos < 0) {
8911 return -EINVAL;
8912 } else {
8913 f->pos = pos;
8914 }
8915
8916 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8917 return f->pos;
8918 }
8919
8920
void Client::lock_fh_pos(Fh *f)
{
  // Acquire the per-handle file-position lock (serializes operations
  // that use the implicit fd position). Contending callers queue their
  // own Cond in FIFO order and block until the lock is free AND they
  // have reached the head of the queue.
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    // We own the lock now; remove ourselves from the wait queue.
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8938
8939 void Client::unlock_fh_pos(Fh *f)
8940 {
8941 ldout(cct, 10) << __func__ << " " << f << dendl;
8942 f->pos_locked = false;
8943 }
8944
int Client::uninline_data(Inode *in, Context *onfinish)
{
  // Migrate inline file data out of the inode into the first RADOS
  // object ("<ino>.00000000"). Completion is signalled via `onfinish`.
  if (!in->inline_data.length()) {
    // Nothing inline: report immediate success.
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // First make sure the backing object exists (non-exclusive create).
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   create_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  // Guard the write with a cmpxattr (GT) on "inline_version" —
  // presumably so a concurrent uninline with a newer version is not
  // clobbered; verify against the OSD cmpxattr semantics.
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
                        CEPH_OSD_CMPXATTR_OP_GT,
                        CEPH_OSD_CMPXATTR_MODE_U64,
                        inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   uninline_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   onfinish);

  return 0;
}
8989
8990 //
8991
8992 // blocking osd interface
8993
8994 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8995 {
8996 std::lock_guard lock(client_lock);
8997 tout(cct) << "read" << std::endl;
8998 tout(cct) << fd << std::endl;
8999 tout(cct) << size << std::endl;
9000 tout(cct) << offset << std::endl;
9001
9002 if (unmounting)
9003 return -ENOTCONN;
9004
9005 Fh *f = get_filehandle(fd);
9006 if (!f)
9007 return -EBADF;
9008 #if defined(__linux__) && defined(O_PATH)
9009 if (f->flags & O_PATH)
9010 return -EBADF;
9011 #endif
9012 bufferlist bl;
9013 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9014 size = std::min(size, (loff_t)INT_MAX);
9015 int r = _read(f, offset, size, &bl);
9016 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9017 if (r >= 0) {
9018 bl.copy(0, bl.length(), buf);
9019 r = bl.length();
9020 }
9021 return r;
9022 }
9023
9024 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9025 {
9026 if (iovcnt < 0)
9027 return -EINVAL;
9028 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9029 }
9030
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  // Core read path. offset < 0 means "use (and advance) the handle's
  // implicit position". Returns bytes read (possibly short at EOF) or
  // a negative errno.
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // Implicit-position read: serialize against other pos users.
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // Unknown inline state: fetch it before choosing a read strategy.
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0) {
    goto done;
  }
  if (f->flags & O_DIRECT)
    // O_DIRECT must bypass the object cache entirely.
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // Can't serve inline data without the CACHE cap: push the data
      // out to RADOS, then fall through to a normal read.
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Serve the read straight from the inline blob, zero-filling any
      // gap between the blob length and the (larger) file size.
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
        r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
        r = endoff - offset;
      } else {
        r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // Short read near our cached EOF: drop caps, revalidate the size
      // with the MDS, and retry if the file turned out to be longer.
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof? short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // Wait for the uninline mutation outside the client lock. Success
    // (or -ECANCELED) means the data now lives in RADOS: drop our
    // inline copy and mark caps dirty so the MDS learns about it.
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return r;
}
9170
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  // Pin the file handle and count this readahead as pending for the
  // lifetime of the completion.
  f->get();
  f->readahead.inc_pending();
}
9176
Client::C_Readahead::~C_Readahead() {
  // Readahead no longer pending; release the handle reference taken
  // in the constructor.
  f->readahead.dec_pending();
  client->_put_fh(f);
}
9181
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // Release the cap refs taken in _read_async when the readahead was issued.
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9186
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  // Buffered read through the ObjectCacher, optionally kicking off
  // background readahead sized by the per-handle Readahead state.
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                              off, len, bl, 0, &onfinish);
  if (r == 0) {
    // Cache miss: hold a CACHE cap ref and wait for the fill outside
    // the client lock.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    r = onfinish.wait();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
                     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                                       readahead_extent.first, readahead_extent.second,
                                       NULL, 0, onfinish2);
      if (r2 == 0) {
        // Readahead in flight: C_Readahead::finish releases these refs.
        ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
        get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
        ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
        delete onfinish2;
      }
    }
  }

  return r;
}
9241
9242 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9243 bool *checkeof)
9244 {
9245 Inode *in = f->inode.get();
9246 uint64_t pos = off;
9247 int left = len;
9248 int read = 0;
9249
9250 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9251
9252 Mutex flock("Client::_read_sync flock");
9253 Cond cond;
9254 while (left > 0) {
9255 C_SaferCond onfinish("Client::_read_sync flock");
9256 bufferlist tbl;
9257
9258 int wanted = left;
9259 filer->read_trunc(in->ino, &in->layout, in->snapid,
9260 pos, left, &tbl, 0,
9261 in->truncate_size, in->truncate_seq,
9262 &onfinish);
9263 client_lock.Unlock();
9264 int r = onfinish.wait();
9265 client_lock.Lock();
9266
9267 // if we get ENOENT from OSD, assume 0 bytes returned
9268 if (r == -ENOENT)
9269 r = 0;
9270 if (r < 0)
9271 return r;
9272 if (tbl.length()) {
9273 r = tbl.length();
9274
9275 read += r;
9276 pos += r;
9277 left -= r;
9278 bl->claim_append(tbl);
9279 }
9280 // short read?
9281 if (r >= 0 && r < wanted) {
9282 if (pos < in->size) {
9283 // zero up to known EOF
9284 int64_t some = in->size - pos;
9285 if (some > left)
9286 some = left;
9287 auto z = buffer::ptr_node::create(some);
9288 z->zero();
9289 bl->push_back(std::move(z));
9290 read += some;
9291 pos += some;
9292 left -= some;
9293 if (left == 0)
9294 return read;
9295 }
9296
9297 *checkeof = true;
9298 return read;
9299 }
9300 }
9301 return read;
9302 }
9303
9304
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync can DDRT.
 */
void Client::_sync_write_commit(Inode *in)
{
  // Completion path for a synchronous write: drop the pending-write
  // count and the FILE_BUFFER cap ref taken when the write was issued.
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    // Unmount waits for all in-flight sync writes to drain; wake it.
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9322
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  // pwrite-style entry point; returns bytes written or a negative errno.
  std::lock_guard lock(client_lock);
  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot perform I/O.
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  // NOTE(review): the last argument of _write is the iovec count; the
  // literal `false` here converts to 0 (no iovecs) — confirm intent.
  int r = _write(fh, offset, size, buf, NULL, false);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}
9347
9348 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9349 {
9350 if (iovcnt < 0)
9351 return -EINVAL;
9352 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9353 }
9354
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
                                       unsigned iovcnt, int64_t offset, bool write,
                                       bool clamp_to_int)
{
  // Common iovec read/write path (client_lock must be held). For reads,
  // the gathered bufferlist is scattered back into the caller's iovecs.
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Scatter the bufferlist back into the iovecs; the final iovec may
    // be only partially filled when the read was short.
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
        bl.copy(bufoff, resid, (char *)iov[j].iov_base);
        break;
      } else {
        bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9405
9406 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9407 {
9408 std::lock_guard lock(client_lock);
9409 tout(cct) << fd << std::endl;
9410 tout(cct) << offset << std::endl;
9411
9412 if (unmounting)
9413 return -ENOTCONN;
9414
9415 Fh *fh = get_filehandle(fd);
9416 if (!fh)
9417 return -EBADF;
9418 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9419 }
9420
9421 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
9422 const struct iovec *iov, int iovcnt)
9423 {
9424 uint64_t fpos = 0;
9425
9426 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
9427 return -EFBIG;
9428
9429 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9430 Inode *in = f->inode.get();
9431
9432 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
9433 return -ENOSPC;
9434 }
9435
9436 ceph_assert(in->snapid == CEPH_NOSNAP);
9437
9438 // was Fh opened as writeable?
9439 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9440 return -EBADF;
9441
9442 // use/adjust fd pos?
9443 if (offset < 0) {
9444 lock_fh_pos(f);
9445 /*
9446 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9447 * change out from under us.
9448 */
9449 if (f->flags & O_APPEND) {
9450 int r = _lseek(f, 0, SEEK_END);
9451 if (r < 0) {
9452 unlock_fh_pos(f);
9453 return r;
9454 }
9455 }
9456 offset = f->pos;
9457 fpos = offset+size;
9458 unlock_fh_pos(f);
9459 }
9460
9461 // check quota
9462 uint64_t endoff = offset + size;
9463 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
9464 f->actor_perms)) {
9465 return -EDQUOT;
9466 }
9467
9468 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9469
9470 ldout(cct, 10) << "cur file size is " << in->size << dendl;
9471
9472 // time it.
9473 utime_t start = ceph_clock_now();
9474
9475 if (in->inline_version == 0) {
9476 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9477 if (r < 0)
9478 return r;
9479 ceph_assert(in->inline_version > 0);
9480 }
9481
9482 // copy into fresh buffer (since our write may be resub, async)
9483 bufferlist bl;
9484 if (buf) {
9485 if (size > 0)
9486 bl.append(buf, size);
9487 } else if (iov){
9488 for (int i = 0; i < iovcnt; i++) {
9489 if (iov[i].iov_len > 0) {
9490 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
9491 }
9492 }
9493 }
9494
9495 utime_t lat;
9496 uint64_t totalwritten;
9497 int want, have;
9498 if (f->mode & CEPH_FILE_MODE_LAZY)
9499 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
9500 else
9501 want = CEPH_CAP_FILE_BUFFER;
9502 int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
9503 if (r < 0)
9504 return r;
9505
9506 /* clear the setuid/setgid bits, if any */
9507 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
9508 struct ceph_statx stx = { 0 };
9509
9510 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9511 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
9512 if (r < 0)
9513 return r;
9514 } else {
9515 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9516 }
9517
9518 if (f->flags & O_DIRECT)
9519 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
9520
9521 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
9522
9523 std::unique_ptr<C_SaferCond> onuninline = nullptr;
9524
9525 if (in->inline_version < CEPH_INLINE_NONE) {
9526 if (endoff > cct->_conf->client_max_inline_size ||
9527 endoff > CEPH_INLINE_MAX_SIZE ||
9528 !(have & CEPH_CAP_FILE_BUFFER)) {
9529 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9530 uninline_data(in, onuninline.get());
9531 } else {
9532 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9533
9534 uint32_t len = in->inline_data.length();
9535
9536 if (endoff < len)
9537 in->inline_data.copy(endoff, len - endoff, bl);
9538
9539 if (offset < len)
9540 in->inline_data.splice(offset, len - offset);
9541 else if (offset > len)
9542 in->inline_data.append_zero(offset - len);
9543
9544 in->inline_data.append(bl);
9545 in->inline_version++;
9546
9547 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9548
9549 goto success;
9550 }
9551 }
9552
9553 if (cct->_conf->client_oc &&
9554 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
9555 // do buffered write
9556 if (!in->oset.dirty_or_tx)
9557 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
9558
9559 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9560
9561 // async, caching, non-blocking.
9562 r = objectcacher->file_write(&in->oset, &in->layout,
9563 in->snaprealm->get_snap_context(),
9564 offset, size, bl, ceph::real_clock::now(),
9565 0);
9566 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9567
9568 if (r < 0)
9569 goto done;
9570
9571 // flush cached write if O_SYNC is set on file fh
9572 // O_DSYNC == O_SYNC on linux < 2.6.33
9573 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9574 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
9575 _flush_range(in, offset, size);
9576 }
9577 } else {
9578 if (f->flags & O_DIRECT)
9579 _flush_range(in, offset, size);
9580
9581 // simple, non-atomic sync write
9582 C_SaferCond onfinish("Client::_write flock");
9583 unsafe_sync_write++;
9584 get_cap_ref(in, CEPH_CAP_FILE_BUFFER); // released by onsafe callback
9585
9586 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
9587 offset, size, bl, ceph::real_clock::now(), 0,
9588 in->truncate_size, in->truncate_seq,
9589 &onfinish);
9590 client_lock.Unlock();
9591 onfinish.wait();
9592 client_lock.Lock();
9593 _sync_write_commit(in);
9594 }
9595
9596 // if we get here, write was successful, update client metadata
9597 success:
9598 // time
9599 lat = ceph_clock_now();
9600 lat -= start;
9601 logger->tinc(l_c_wrlat, lat);
9602
9603 if (fpos) {
9604 lock_fh_pos(f);
9605 f->pos = fpos;
9606 unlock_fh_pos(f);
9607 }
9608 totalwritten = size;
9609 r = (int64_t)totalwritten;
9610
9611 // extend file?
9612 if (totalwritten + offset > in->size) {
9613 in->size = totalwritten + offset;
9614 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9615
9616 if (is_quota_bytes_approaching(in, f->actor_perms)) {
9617 check_caps(in, CHECK_CAPS_NODELAY);
9618 } else if (is_max_size_approaching(in)) {
9619 check_caps(in, 0);
9620 }
9621
9622 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
9623 } else {
9624 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
9625 }
9626
9627 // mtime
9628 in->mtime = in->ctime = ceph_clock_now();
9629 in->change_attr++;
9630 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9631
9632 done:
9633
9634 if (nullptr != onuninline) {
9635 client_lock.Unlock();
9636 int uninline_ret = onuninline->wait();
9637 client_lock.Lock();
9638
9639 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
9640 in->inline_data.clear();
9641 in->inline_version = CEPH_INLINE_NONE;
9642 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9643 check_caps(in, 0);
9644 } else
9645 r = uninline_ret;
9646 }
9647
9648 put_cap_ref(in, CEPH_CAP_FILE_WR);
9649 return r;
9650 }
9651
9652 int Client::_flush(Fh *f)
9653 {
9654 Inode *in = f->inode.get();
9655 int err = f->take_async_err();
9656 if (err != 0) {
9657 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9658 << cpp_strerror(err) << dendl;
9659 } else {
9660 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9661 }
9662
9663 return err;
9664 }
9665
9666 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9667 {
9668 struct ceph_statx stx;
9669 stx.stx_size = length;
9670 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9671 }
9672
9673 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9674 {
9675 std::lock_guard lock(client_lock);
9676 tout(cct) << __func__ << std::endl;
9677 tout(cct) << fd << std::endl;
9678 tout(cct) << length << std::endl;
9679
9680 if (unmounting)
9681 return -ENOTCONN;
9682
9683 Fh *f = get_filehandle(fd);
9684 if (!f)
9685 return -EBADF;
9686 #if defined(__linux__) && defined(O_PATH)
9687 if (f->flags & O_PATH)
9688 return -EBADF;
9689 #endif
9690 struct stat attr;
9691 attr.st_size = length;
9692 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9693 }
9694
9695 int Client::fsync(int fd, bool syncdataonly)
9696 {
9697 std::lock_guard lock(client_lock);
9698 tout(cct) << "fsync" << std::endl;
9699 tout(cct) << fd << std::endl;
9700 tout(cct) << syncdataonly << std::endl;
9701
9702 if (unmounting)
9703 return -ENOTCONN;
9704
9705 Fh *f = get_filehandle(fd);
9706 if (!f)
9707 return -EBADF;
9708 #if defined(__linux__) && defined(O_PATH)
9709 if (f->flags & O_PATH)
9710 return -EBADF;
9711 #endif
9712 int r = _fsync(f, syncdataonly);
9713 if (r == 0) {
9714 // The IOs in this fsync were okay, but maybe something happened
9715 // in the background that we shoudl be reporting?
9716 r = f->take_async_err();
9717 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9718 << ") = 0, async_err = " << r << dendl;
9719 } else {
9720 // Assume that an error we encountered during fsync, even reported
9721 // synchronously, would also have applied the error to the Fh, and we
9722 // should clear it here to avoid returning the same error again on next
9723 // call.
9724 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9725 << r << dendl;
9726 f->take_async_err();
9727 }
9728 return r;
9729 }
9730
/**
 * Flush an inode's dirty data and (optionally) metadata to the cluster.
 *
 * @param in            inode to flush
 * @param syncdataonly  if true, only file data is flushed; dirty caps and
 *                      unsafe MDS requests are not waited on
 * @return 0 on success, negative error from the data writeback
 *
 * Called with client_lock held; the lock is dropped while waiting for the
 * objectcacher flush to complete.
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  // kick off a synchronous cap flush and remember which flush tid to wait on
  if (!syncdataonly && in->dirty_caps) {
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  // wait for the most recent unsafe MDS request on this inode to become
  // safe; earlier requests are implied safe once the last one is
  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // objectcacher disabled: wait for in-flight sync writes to drain by
    // watching the FILE_BUFFER cap refcount.
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    // data made it out; now wait for the cap flush (if we started one)
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
9797
9798 int Client::_fsync(Fh *f, bool syncdataonly)
9799 {
9800 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9801 return _fsync(f->inode.get(), syncdataonly);
9802 }
9803
9804 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9805 {
9806 std::lock_guard lock(client_lock);
9807 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9808 tout(cct) << fd << std::endl;
9809
9810 if (unmounting)
9811 return -ENOTCONN;
9812
9813 Fh *f = get_filehandle(fd);
9814 if (!f)
9815 return -EBADF;
9816 int r = _getattr(f->inode, mask, perms);
9817 if (r < 0)
9818 return r;
9819 fill_stat(f->inode, stbuf, NULL);
9820 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9821 return r;
9822 }
9823
9824 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9825 unsigned int want, unsigned int flags)
9826 {
9827 std::lock_guard lock(client_lock);
9828 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9829 tout(cct) << fd << std::endl;
9830
9831 if (unmounting)
9832 return -ENOTCONN;
9833
9834 Fh *f = get_filehandle(fd);
9835 if (!f)
9836 return -EBADF;
9837
9838 unsigned mask = statx_to_mask(flags, want);
9839
9840 int r = 0;
9841 if (mask && !f->inode->caps_issued_mask(mask, true)) {
9842 r = _getattr(f->inode, mask, perms);
9843 if (r < 0) {
9844 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9845 return r;
9846 }
9847 }
9848
9849 fill_statx(f->inode, mask, stx);
9850 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9851 return r;
9852 }
9853
// TODO: link() support is not implemented here yet.
9855
9856 int Client::chdir(const char *relpath, std::string &new_cwd,
9857 const UserPerm& perms)
9858 {
9859 std::lock_guard lock(client_lock);
9860 tout(cct) << "chdir" << std::endl;
9861 tout(cct) << relpath << std::endl;
9862
9863 if (unmounting)
9864 return -ENOTCONN;
9865
9866 filepath path(relpath);
9867 InodeRef in;
9868 int r = path_walk(path, &in, perms);
9869 if (r < 0)
9870 return r;
9871 if (cwd != in)
9872 cwd.swap(in);
9873 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9874
9875 _getcwd(new_cwd, perms);
9876 return 0;
9877 }
9878
/**
 * Compute the absolute path of the current working directory by walking
 * parent dentries from cwd up to the mount root.
 *
 * On success 'dir' is set to the path; if cwd or an ancestor has been
 * unlinked, 'dir' is left untouched. Must be called with client_lock held.
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();

    if (!dn) {
      // look it up: ask the MDS for this inode's name in its parent
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);  // shadows the outer 'path' deliberately
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
        break;

      // start over: the reply may have linked in dentries anywhere along
      // the chain, so rebuild the path from cwd
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9918
9919 void Client::getcwd(string& dir, const UserPerm& perms)
9920 {
9921 std::lock_guard l(client_lock);
9922 if (!unmounting)
9923 _getcwd(dir, perms);
9924 }
9925
9926 int Client::statfs(const char *path, struct statvfs *stbuf,
9927 const UserPerm& perms)
9928 {
9929 std::lock_guard l(client_lock);
9930 tout(cct) << __func__ << std::endl;
9931 unsigned long int total_files_on_fs;
9932
9933 if (unmounting)
9934 return -ENOTCONN;
9935
9936 ceph_statfs stats;
9937 C_SaferCond cond;
9938
9939 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9940 if (data_pools.size() == 1) {
9941 objecter->get_fs_stats(stats, data_pools[0], &cond);
9942 } else {
9943 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9944 }
9945
9946 client_lock.Unlock();
9947 int rval = cond.wait();
9948 assert(root);
9949 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
9950 client_lock.Lock();
9951
9952 if (rval < 0) {
9953 ldout(cct, 1) << "underlying call to statfs returned error: "
9954 << cpp_strerror(rval)
9955 << dendl;
9956 return rval;
9957 }
9958
9959 memset(stbuf, 0, sizeof(*stbuf));
9960
9961 /*
9962 * we're going to set a block size of 4MB so we can represent larger
9963 * FSes without overflowing. Additionally convert the space
9964 * measurements from KB to bytes while making them in terms of
9965 * blocks. We use 4MB only because it is big enough, and because it
9966 * actually *is* the (ceph) default block size.
9967 */
9968 const int CEPH_BLOCK_SHIFT = 22;
9969 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9970 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
9971 stbuf->f_files = total_files_on_fs;
9972 stbuf->f_ffree = 0;
9973 stbuf->f_favail = -1;
9974 stbuf->f_fsid = -1; // ??
9975 stbuf->f_flag = 0; // ??
9976 stbuf->f_namemax = NAME_MAX;
9977
9978 // Usually quota_root will == root_ancestor, but if the mount root has no
9979 // quota but we can see a parent of it that does have a quota, we'll
9980 // respect that one instead.
9981 ceph_assert(root != nullptr);
9982 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
9983
9984 // get_quota_root should always give us something
9985 // because client quotas are always enabled
9986 ceph_assert(quota_root != nullptr);
9987
9988 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
9989
9990 // Skip the getattr if any sessions are stale, as we don't want to
9991 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9992 // is unhealthy.
9993 if (!_any_stale_sessions()) {
9994 int r = _getattr(quota_root, 0, perms, true);
9995 if (r != 0) {
9996 // Ignore return value: error getting latest inode metadata is not a good
9997 // reason to break "df".
9998 lderr(cct) << "Error in getattr on quota root 0x"
9999 << std::hex << quota_root->ino << std::dec
10000 << " statfs result may be outdated" << dendl;
10001 }
10002 }
10003
10004 // Special case: if there is a size quota set on the Inode acting
10005 // as the root for this client mount, then report the quota status
10006 // as the filesystem statistics.
10007 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10008 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10009 // It is possible for a quota to be exceeded: arithmetic here must
10010 // handle case where used > total.
10011 const fsblkcnt_t free = total > used ? total - used : 0;
10012
10013 stbuf->f_blocks = total;
10014 stbuf->f_bfree = free;
10015 stbuf->f_bavail = free;
10016 } else {
10017 // General case: report the cluster statistics returned from RADOS. Because
10018 // multiple pools may be used without one filesystem namespace via
10019 // layouts, this is the most correct thing we can do.
10020 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10021 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10022 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10023 }
10024
10025 return rval;
10026 }
10027
/**
 * Issue a file-lock operation (fcntl- or flock-style) to the MDS and, on
 * success, mirror the result into the local lock tables.
 *
 * @param in        inode being locked
 * @param fh        file handle issuing the lock (supplies credentials and
 *                  per-handle lock bookkeeping)
 * @param lock_type CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
 * @param op        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
 * @param sleep     nonzero to block waiting for a conflicting lock
 * @param fl        lock description; updated in place for GETFILELOCK
 * @param owner     lock owner token
 * @param removing  true when called from _release_filelocks: skip the
 *                  per-Fh bookkeeping update
 * @return 0 on success, negative error code otherwise
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
                         struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
                 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
                 << " type " << fl->l_type << " owner " << owner
                 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // translate the POSIX lock type into the Ceph wire encoding
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // only a blocking SETFILELOCK of an actual lock can sleep
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: a signal can cancel this blocked request via
    // _interrupt_filelock()
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting (or absent) lock returned by the MDS back
      // into POSIX flock form
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
        fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
        fl->l_type = F_WRLCK;
      else
        fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // the MDS applied the change; mirror it into the per-inode table
      // (lazily instantiated)
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
        if (!in->fcntl_locks)
          in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
        lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
        if (!in->flock_locks)
          in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
        lock_state = in->flock_locks.get();
      } else {
        ceph_abort();
        return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // also track the lock on the file handle, unless we're in the middle
      // of releasing this handle's locks
      if (!removing) {
        if (lock_type == CEPH_LOCK_FCNTL) {
          if (!fh->fcntl_locks)
            fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
          lock_state = fh->fcntl_locks.get();
        } else {
          if (!fh->flock_locks)
            fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
          lock_state = fh->flock_locks.get();
        }
        _update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
10138
/**
 * Interrupt a blocked file-lock request.
 *
 * Marks 'req' aborted with -EINTR (which prevents it from being resent)
 * and, if it has already been sent to an MDS, issues a companion *_INTR
 * unlock request so the MDS cancels the pending lock.
 *
 * @return 0, or the result of sending the interrupt request
 */
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // map the original lock rule onto its interrupt counterpart
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  // copy the original lock arguments, then rewrite them into an unlock on
  // the interrupt rule
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // send the interrupt under the original requester's credentials
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
10171
10172 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10173 {
10174 if (!in->fcntl_locks && !in->flock_locks)
10175 return;
10176
10177 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10178 encode(nr_fcntl_locks, bl);
10179 if (nr_fcntl_locks) {
10180 auto &lock_state = in->fcntl_locks;
10181 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10182 p != lock_state->held_locks.end();
10183 ++p)
10184 encode(p->second, bl);
10185 }
10186
10187 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10188 encode(nr_flock_locks, bl);
10189 if (nr_flock_locks) {
10190 auto &lock_state = in->flock_locks;
10191 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10192 p != lock_state->held_locks.end();
10193 ++p)
10194 encode(p->second, bl);
10195 }
10196
10197 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10198 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10199 }
10200
10201 void Client::_release_filelocks(Fh *fh)
10202 {
10203 if (!fh->fcntl_locks && !fh->flock_locks)
10204 return;
10205
10206 Inode *in = fh->inode.get();
10207 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
10208
10209 list<pair<int, ceph_filelock> > to_release;
10210
10211 if (fh->fcntl_locks) {
10212 auto &lock_state = fh->fcntl_locks;
10213 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10214 p != lock_state->held_locks.end();
10215 ++p)
10216 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
10217 lock_state.reset();
10218 }
10219 if (fh->flock_locks) {
10220 auto &lock_state = fh->flock_locks;
10221 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10222 p != lock_state->held_locks.end();
10223 ++p)
10224 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
10225 lock_state.reset();
10226 }
10227
10228 if (to_release.empty())
10229 return;
10230
10231 // mds has already released filelocks if session was closed.
10232 if (in->caps.empty())
10233 return;
10234
10235 struct flock fl;
10236 memset(&fl, 0, sizeof(fl));
10237 fl.l_whence = SEEK_SET;
10238 fl.l_type = F_UNLCK;
10239
10240 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10241 p != to_release.end();
10242 ++p) {
10243 fl.l_start = p->second.start;
10244 fl.l_len = p->second.length;
10245 fl.l_pid = p->second.pid;
10246 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10247 p->second.owner, true);
10248 }
10249 }
10250
10251 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10252 ceph_lock_state_t *lock_state)
10253 {
10254 int lock_cmd;
10255 if (F_RDLCK == fl->l_type)
10256 lock_cmd = CEPH_LOCK_SHARED;
10257 else if (F_WRLCK == fl->l_type)
10258 lock_cmd = CEPH_LOCK_EXCL;
10259 else
10260 lock_cmd = CEPH_LOCK_UNLOCK;;
10261
10262 ceph_filelock filelock;
10263 filelock.start = fl->l_start;
10264 filelock.length = fl->l_len;
10265 filelock.client = 0;
10266 // see comment in _do_filelock()
10267 filelock.owner = owner | (1ULL << 63);
10268 filelock.pid = fl->l_pid;
10269 filelock.type = lock_cmd;
10270
10271 if (filelock.type == CEPH_LOCK_UNLOCK) {
10272 list<ceph_filelock> activated_locks;
10273 lock_state->remove_lock(filelock, activated_locks);
10274 } else {
10275 bool r = lock_state->add_lock(filelock, false, false, NULL);
10276 ceph_assert(r);
10277 }
10278 }
10279
10280 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10281 {
10282 Inode *in = fh->inode.get();
10283 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10284 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10285 return ret;
10286 }
10287
10288 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10289 {
10290 Inode *in = fh->inode.get();
10291 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10292 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10293 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10294 return ret;
10295 }
10296
10297 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10298 {
10299 Inode *in = fh->inode.get();
10300 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10301
10302 int sleep = !(cmd & LOCK_NB);
10303 cmd &= ~LOCK_NB;
10304
10305 int type;
10306 switch (cmd) {
10307 case LOCK_SH:
10308 type = F_RDLCK;
10309 break;
10310 case LOCK_EX:
10311 type = F_WRLCK;
10312 break;
10313 case LOCK_UN:
10314 type = F_UNLCK;
10315 break;
10316 default:
10317 return -EINVAL;
10318 }
10319
10320 struct flock fl;
10321 memset(&fl, 0, sizeof(fl));
10322 fl.l_type = type;
10323 fl.l_whence = SEEK_SET;
10324
10325 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10326 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10327 return ret;
10328 }
10329
10330 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10331 {
10332 /* Since the only thing this does is wrap a call to statfs, and
10333 statfs takes a lock, it doesn't seem we have a need to split it
10334 out. */
10335 return statfs(0, stbuf, perms);
10336 }
10337
10338 void Client::ll_register_callbacks(struct client_callback_args *args)
10339 {
10340 if (!args)
10341 return;
10342 std::lock_guard l(client_lock);
10343 ldout(cct, 10) << __func__ << " cb " << args->handle
10344 << " invalidate_ino_cb " << args->ino_cb
10345 << " invalidate_dentry_cb " << args->dentry_cb
10346 << " switch_interrupt_cb " << args->switch_intr_cb
10347 << " remount_cb " << args->remount_cb
10348 << dendl;
10349 callback_handle = args->handle;
10350 if (args->ino_cb) {
10351 ino_invalidate_cb = args->ino_cb;
10352 async_ino_invalidator.start();
10353 }
10354 if (args->dentry_cb) {
10355 dentry_invalidate_cb = args->dentry_cb;
10356 async_dentry_invalidator.start();
10357 }
10358 if (args->switch_intr_cb) {
10359 switch_interrupt_cb = args->switch_intr_cb;
10360 interrupt_finisher.start();
10361 }
10362 if (args->remount_cb) {
10363 remount_cb = args->remount_cb;
10364 remount_finisher.start();
10365 }
10366 umask_cb = args->umask_cb;
10367 }
10368
10369 int Client::test_dentry_handling(bool can_invalidate)
10370 {
10371 int r = 0;
10372
10373 can_invalidate_dentries = can_invalidate;
10374
10375 if (can_invalidate_dentries) {
10376 ceph_assert(dentry_invalidate_cb);
10377 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10378 r = 0;
10379 } else {
10380 ceph_assert(remount_cb);
10381 ldout(cct, 1) << "using remount_cb" << dendl;
10382 r = _do_remount(false);
10383 }
10384
10385 return r;
10386 }
10387
/**
 * Flush all dirty file data and caps for the whole mount and wait for
 * unsafe MDS requests to become safe.
 *
 * Called with client_lock held; the lock is dropped while waiting for the
 * objectcacher flush to finish. Always returns 0.
 */
int Client::_sync_fs()
{
  ldout(cct, 10) << __func__ << dendl;

  // flush file data
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  // only now wait for the data flush started above, dropping the lock so
  // writeback completions can make progress
  if (nullptr != cond) {
    client_lock.Unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.Lock();
  }

  return 0;
}
10418
10419 int Client::sync_fs()
10420 {
10421 std::lock_guard l(client_lock);
10422
10423 if (unmounting)
10424 return -ENOTCONN;
10425
10426 return _sync_fs();
10427 }
10428
10429 int64_t Client::drop_caches()
10430 {
10431 std::lock_guard l(client_lock);
10432 return objectcacher->release_all();
10433 }
10434
10435 int Client::_lazyio(Fh *fh, int enable)
10436 {
10437 Inode *in = fh->inode.get();
10438 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10439
10440 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10441 return 0;
10442
10443 int orig_mode = fh->mode;
10444 if (enable) {
10445 fh->mode |= CEPH_FILE_MODE_LAZY;
10446 in->get_open_ref(fh->mode);
10447 in->put_open_ref(orig_mode);
10448 check_caps(in, CHECK_CAPS_NODELAY);
10449 } else {
10450 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10451 in->get_open_ref(fh->mode);
10452 in->put_open_ref(orig_mode);
10453 check_caps(in, 0);
10454 }
10455
10456 return 0;
10457 }
10458
10459 int Client::lazyio(int fd, int enable)
10460 {
10461 std::lock_guard l(client_lock);
10462 Fh *f = get_filehandle(fd);
10463 if (!f)
10464 return -EBADF;
10465
10466 return _lazyio(f, enable);
10467 }
10468
10469 int Client::ll_lazyio(Fh *fh, int enable)
10470 {
10471 std::lock_guard lock(client_lock);
10472 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10473 tout(cct) << __func__ << std::endl;
10474
10475 return _lazyio(fh, enable);
10476 }
10477
10478 int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10479 {
10480 std::lock_guard l(client_lock);
10481 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10482 << ", " << offset << ", " << count << ")" << dendl;
10483
10484 Fh *f = get_filehandle(fd);
10485 if (!f)
10486 return -EBADF;
10487
10488 // for now
10489 _fsync(f, true);
10490
10491 return 0;
10492 }
10493
10494 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10495 {
10496 std::lock_guard l(client_lock);
10497 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10498 << ", " << offset << ", " << count << ")" << dendl;
10499
10500 Fh *f = get_filehandle(fd);
10501 if (!f)
10502 return -EBADF;
10503 Inode *in = f->inode.get();
10504
10505 _fsync(f, true);
10506 if (_release(in))
10507 check_caps(in, 0);
10508 return 0;
10509 }
10510
10511
10512 // =============================
10513 // snaps
10514
10515 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10516 {
10517 std::lock_guard l(client_lock);
10518
10519 if (unmounting)
10520 return -ENOTCONN;
10521
10522 filepath path(relpath);
10523 InodeRef in;
10524 int r = path_walk(path, &in, perm);
10525 if (r < 0)
10526 return r;
10527 if (cct->_conf->client_permissions) {
10528 r = may_create(in.get(), perm);
10529 if (r < 0)
10530 return r;
10531 }
10532 Inode *snapdir = open_snapdir(in.get());
10533 return _mkdir(snapdir, name, 0, perm);
10534 }
10535
10536 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10537 {
10538 std::lock_guard l(client_lock);
10539
10540 if (unmounting)
10541 return -ENOTCONN;
10542
10543 filepath path(relpath);
10544 InodeRef in;
10545 int r = path_walk(path, &in, perms);
10546 if (r < 0)
10547 return r;
10548 if (cct->_conf->client_permissions) {
10549 r = may_delete(in.get(), NULL, perms);
10550 if (r < 0)
10551 return r;
10552 }
10553 Inode *snapdir = open_snapdir(in.get());
10554 return _rmdir(snapdir, name, perms);
10555 }
10556
10557 // =============================
10558 // expose caps
10559
10560 int Client::get_caps_issued(int fd) {
10561
10562 std::lock_guard lock(client_lock);
10563
10564 if (unmounting)
10565 return -ENOTCONN;
10566
10567 Fh *f = get_filehandle(fd);
10568 if (!f)
10569 return -EBADF;
10570
10571 return f->inode->caps_issued();
10572 }
10573
10574 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10575 {
10576 std::lock_guard lock(client_lock);
10577
10578 if (unmounting)
10579 return -ENOTCONN;
10580
10581 filepath p(path);
10582 InodeRef in;
10583 int r = path_walk(p, &in, perms, true);
10584 if (r < 0)
10585 return r;
10586 return in->caps_issued();
10587 }
10588
10589 // =========================================
10590 // low level
10591
// Return the virtual ".snap" directory inode for `diri`, creating and
// caching it in inode_map on first use.  The snapdir shares the parent's
// ino but uses the special CEPH_SNAPDIR snapid.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // Mirror the parent's identity/attributes onto the virtual dir.
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->nlink = 1;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    // Mark the parent so it knows a snapdir has been instantiated for it.
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10624
10625 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10626 Inode **out, const UserPerm& perms)
10627 {
10628 std::lock_guard lock(client_lock);
10629 vinodeno_t vparent = _get_vino(parent);
10630 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10631 tout(cct) << __func__ << std::endl;
10632 tout(cct) << name << std::endl;
10633
10634 if (unmounting)
10635 return -ENOTCONN;
10636
10637 int r = 0;
10638 auto fuse_default_permissions = cct->_conf.get_val<bool>(
10639 "fuse_default_permissions");
10640 if (!fuse_default_permissions) {
10641 if (strcmp(name, ".") && strcmp(name, "..")) {
10642 r = may_lookup(parent, perms);
10643 if (r < 0)
10644 return r;
10645 }
10646 }
10647
10648 string dname(name);
10649 InodeRef in;
10650
10651 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10652 if (r < 0) {
10653 attr->st_ino = 0;
10654 goto out;
10655 }
10656
10657 ceph_assert(in);
10658 fill_stat(in, attr);
10659 _ll_get(in.get());
10660
10661 out:
10662 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10663 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10664 tout(cct) << attr->st_ino << std::endl;
10665 *out = in.get();
10666 return r;
10667 }
10668
// Look up an inode by number and make sure it has a linked dentry, so
// that later path-based operations on it can work.  On success the
// caller holds the ll reference taken by _lookup_ino.
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  if (unmounting)
    return -ENOTCONN;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;

  ceph_assert(*inode != NULL);

  // Already linked into the namespace; nothing more to do.
  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return 0;
  }

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r) {
    // Drop the reference taken in Num1 before failing.
    _ll_forget(*inode, 1);
    return r;
  }

  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error; drop both references taken above.
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // The parent reference was only needed for the name lookup.
  _ll_forget(parent, 1);
  return 0;
}
10720
10721 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10722 struct ceph_statx *stx, unsigned want, unsigned flags,
10723 const UserPerm& perms)
10724 {
10725 std::lock_guard lock(client_lock);
10726 vinodeno_t vparent = _get_vino(parent);
10727 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10728 tout(cct) << "ll_lookupx" << std::endl;
10729 tout(cct) << name << std::endl;
10730
10731 if (unmounting)
10732 return -ENOTCONN;
10733
10734 int r = 0;
10735 auto fuse_default_permissions = cct->_conf.get_val<bool>(
10736 "fuse_default_permissions");
10737 if (!fuse_default_permissions) {
10738 r = may_lookup(parent, perms);
10739 if (r < 0)
10740 return r;
10741 }
10742
10743 string dname(name);
10744 InodeRef in;
10745
10746 unsigned mask = statx_to_mask(flags, want);
10747 r = _lookup(parent, dname, mask, &in, perms);
10748 if (r < 0) {
10749 stx->stx_ino = 0;
10750 stx->stx_mask = 0;
10751 } else {
10752 ceph_assert(in);
10753 fill_statx(in, mask, stx);
10754 _ll_get(in.get());
10755 }
10756
10757 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10758 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10759 tout(cct) << stx->stx_ino << std::endl;
10760 *out = in.get();
10761 return r;
10762 }
10763
10764 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10765 unsigned int want, unsigned int flags, const UserPerm& perms)
10766 {
10767 std::lock_guard lock(client_lock);
10768
10769 if (unmounting)
10770 return -ENOTCONN;
10771
10772 filepath fp(name, 0);
10773 InodeRef in;
10774 int rc;
10775 unsigned mask = statx_to_mask(flags, want);
10776
10777 ldout(cct, 3) << __func__ << " " << name << dendl;
10778 tout(cct) << __func__ << std::endl;
10779 tout(cct) << name << std::endl;
10780
10781 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10782 if (rc < 0) {
10783 /* zero out mask, just in case... */
10784 stx->stx_mask = 0;
10785 stx->stx_ino = 0;
10786 *out = NULL;
10787 return rc;
10788 } else {
10789 ceph_assert(in);
10790 fill_statx(in, mask, stx);
10791 _ll_get(in.get());
10792 *out = in.get();
10793 return 0;
10794 }
10795 }
10796
// Take one low-level (FUSE-style) reference on `in`.  The first ll
// reference also pins the inode itself, its parent dentry (for
// directories) and bumps the per-snapshot refcount.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    // First ll ref: pin the underlying inode.
    in->get();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    // Track how many ll refs exist per snapshot id.
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10811
// Drop `num` low-level references from `in`.  When the last ll ref goes
// away, the pins taken in _ll_get are released (parent dentry, snapshot
// refcount) and the inode itself is put.  Returns the remaining ll_ref
// count (0 means the last ll ref was dropped).
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    // Balance the per-snapshot refcount taken in _ll_get.
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    // May free the inode (and erase it from inode_map).
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10834
// Drop every outstanding low-level reference in inode_map (used during
// unmount).  _ll_put can erase the current entry from inode_map, so the
// iterator is advanced before each call, and `to_be_put` holds an
// InodeRef on each inode so none of them are destroyed until this
// function returns.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    // Advance before _ll_put, which may invalidate `it`.
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
10852
// Drop `count` low-level references on `in` (the FUSE "forget"
// operation).  Returns true when the last ll reference was dropped (or
// when the forget was ignored entirely).
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // More forgets than references: warn and drop whatever is left.
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10881
10882 bool Client::ll_forget(Inode *in, uint64_t count)
10883 {
10884 std::lock_guard lock(client_lock);
10885 return _ll_forget(in, count);
10886 }
10887
10888 bool Client::ll_put(Inode *in)
10889 {
10890 /* ll_forget already takes the lock */
10891 return ll_forget(in, 1);
10892 }
10893
10894 int Client::ll_get_snap_ref(snapid_t snap)
10895 {
10896 std::lock_guard lock(client_lock);
10897 auto p = ll_snap_ref.find(snap);
10898 if (p != ll_snap_ref.end())
10899 return p->second;
10900 return 0;
10901 }
10902
10903 snapid_t Client::ll_get_snapid(Inode *in)
10904 {
10905 std::lock_guard lock(client_lock);
10906 return in->snapid;
10907 }
10908
10909 Inode *Client::ll_get_inode(ino_t ino)
10910 {
10911 std::lock_guard lock(client_lock);
10912
10913 if (unmounting)
10914 return NULL;
10915
10916 vinodeno_t vino = _map_faked_ino(ino);
10917 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10918 if (p == inode_map.end())
10919 return NULL;
10920 Inode *in = p->second;
10921 _ll_get(in);
10922 return in;
10923 }
10924
10925 Inode *Client::ll_get_inode(vinodeno_t vino)
10926 {
10927 std::lock_guard lock(client_lock);
10928
10929 if (unmounting)
10930 return NULL;
10931
10932 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10933 if (p == inode_map.end())
10934 return NULL;
10935 Inode *in = p->second;
10936 _ll_get(in);
10937 return in;
10938 }
10939
10940 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10941 {
10942 vinodeno_t vino = _get_vino(in);
10943
10944 ldout(cct, 8) << __func__ << " " << vino << dendl;
10945 tout(cct) << __func__ << std::endl;
10946 tout(cct) << vino.ino.val << std::endl;
10947
10948 if (vino.snapid < CEPH_NOSNAP)
10949 return 0;
10950 else
10951 return _getattr(in, caps, perms);
10952 }
10953
10954 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10955 {
10956 std::lock_guard lock(client_lock);
10957
10958 if (unmounting)
10959 return -ENOTCONN;
10960
10961 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10962
10963 if (res == 0)
10964 fill_stat(in, attr);
10965 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
10966 return res;
10967 }
10968
10969 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10970 unsigned int flags, const UserPerm& perms)
10971 {
10972 std::lock_guard lock(client_lock);
10973
10974 if (unmounting)
10975 return -ENOTCONN;
10976
10977 int res = 0;
10978 unsigned mask = statx_to_mask(flags, want);
10979
10980 if (mask && !in->caps_issued_mask(mask, true))
10981 res = _ll_getattr(in, mask, perms);
10982
10983 if (res == 0)
10984 fill_statx(in, mask, stx);
10985 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
10986 return res;
10987 }
10988
// Shared implementation for ll_setattr/ll_setattrx: traces the request,
// enforces permissions unless FUSE is doing it, and forwards to
// __setattrx.  `inp` receives the (possibly updated) target inode.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  // When FUSE handles permissions itself, skip the client-side check.
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // The *_NOW variants were only meaningful for the permission check
  // above; strip them before performing the actual setattr.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
11019
11020 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11021 const UserPerm& perms)
11022 {
11023 std::lock_guard lock(client_lock);
11024
11025 if (unmounting)
11026 return -ENOTCONN;
11027
11028 InodeRef target(in);
11029 int res = _ll_setattrx(in, stx, mask, perms, &target);
11030 if (res == 0) {
11031 ceph_assert(in == target.get());
11032 fill_statx(in, in->caps_issued(), stx);
11033 }
11034
11035 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11036 return res;
11037 }
11038
11039 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11040 const UserPerm& perms)
11041 {
11042 struct ceph_statx stx;
11043 stat_to_statx(attr, &stx);
11044
11045 std::lock_guard lock(client_lock);
11046
11047 if (unmounting)
11048 return -ENOTCONN;
11049
11050 InodeRef target(in);
11051 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11052 if (res == 0) {
11053 ceph_assert(in == target.get());
11054 fill_stat(in, attr);
11055 }
11056
11057 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11058 return res;
11059 }
11060
11061
11062 // ----------
11063 // xattrs
11064
11065 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11066 const UserPerm& perms)
11067 {
11068 std::lock_guard lock(client_lock);
11069
11070 if (unmounting)
11071 return -ENOTCONN;
11072
11073 InodeRef in;
11074 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11075 if (r < 0)
11076 return r;
11077 return _getxattr(in, name, value, size, perms);
11078 }
11079
11080 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11081 const UserPerm& perms)
11082 {
11083 std::lock_guard lock(client_lock);
11084
11085 if (unmounting)
11086 return -ENOTCONN;
11087
11088 InodeRef in;
11089 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11090 if (r < 0)
11091 return r;
11092 return _getxattr(in, name, value, size, perms);
11093 }
11094
11095 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11096 const UserPerm& perms)
11097 {
11098 std::lock_guard lock(client_lock);
11099
11100 if (unmounting)
11101 return -ENOTCONN;
11102
11103 Fh *f = get_filehandle(fd);
11104 if (!f)
11105 return -EBADF;
11106 return _getxattr(f->inode, name, value, size, perms);
11107 }
11108
11109 int Client::listxattr(const char *path, char *list, size_t size,
11110 const UserPerm& perms)
11111 {
11112 std::lock_guard lock(client_lock);
11113
11114 if (unmounting)
11115 return -ENOTCONN;
11116
11117 InodeRef in;
11118 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11119 if (r < 0)
11120 return r;
11121 return Client::_listxattr(in.get(), list, size, perms);
11122 }
11123
11124 int Client::llistxattr(const char *path, char *list, size_t size,
11125 const UserPerm& perms)
11126 {
11127 std::lock_guard lock(client_lock);
11128
11129 if (unmounting)
11130 return -ENOTCONN;
11131
11132 InodeRef in;
11133 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11134 if (r < 0)
11135 return r;
11136 return Client::_listxattr(in.get(), list, size, perms);
11137 }
11138
11139 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11140 {
11141 std::lock_guard lock(client_lock);
11142
11143 if (unmounting)
11144 return -ENOTCONN;
11145
11146 Fh *f = get_filehandle(fd);
11147 if (!f)
11148 return -EBADF;
11149 return Client::_listxattr(f->inode.get(), list, size, perms);
11150 }
11151
11152 int Client::removexattr(const char *path, const char *name,
11153 const UserPerm& perms)
11154 {
11155 std::lock_guard lock(client_lock);
11156
11157 if (unmounting)
11158 return -ENOTCONN;
11159
11160 InodeRef in;
11161 int r = Client::path_walk(path, &in, perms, true);
11162 if (r < 0)
11163 return r;
11164 return _removexattr(in, name, perms);
11165 }
11166
11167 int Client::lremovexattr(const char *path, const char *name,
11168 const UserPerm& perms)
11169 {
11170 std::lock_guard lock(client_lock);
11171
11172 if (unmounting)
11173 return -ENOTCONN;
11174
11175 InodeRef in;
11176 int r = Client::path_walk(path, &in, perms, false);
11177 if (r < 0)
11178 return r;
11179 return _removexattr(in, name, perms);
11180 }
11181
11182 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11183 {
11184 std::lock_guard lock(client_lock);
11185
11186 if (unmounting)
11187 return -ENOTCONN;
11188
11189 Fh *f = get_filehandle(fd);
11190 if (!f)
11191 return -EBADF;
11192 return _removexattr(f->inode, name, perms);
11193 }
11194
11195 int Client::setxattr(const char *path, const char *name, const void *value,
11196 size_t size, int flags, const UserPerm& perms)
11197 {
11198 _setxattr_maybe_wait_for_osdmap(name, value, size);
11199
11200 std::lock_guard lock(client_lock);
11201
11202 if (unmounting)
11203 return -ENOTCONN;
11204
11205 InodeRef in;
11206 int r = Client::path_walk(path, &in, perms, true);
11207 if (r < 0)
11208 return r;
11209 return _setxattr(in, name, value, size, flags, perms);
11210 }
11211
11212 int Client::lsetxattr(const char *path, const char *name, const void *value,
11213 size_t size, int flags, const UserPerm& perms)
11214 {
11215 _setxattr_maybe_wait_for_osdmap(name, value, size);
11216
11217 std::lock_guard lock(client_lock);
11218
11219 if (unmounting)
11220 return -ENOTCONN;
11221
11222 InodeRef in;
11223 int r = Client::path_walk(path, &in, perms, false);
11224 if (r < 0)
11225 return r;
11226 return _setxattr(in, name, value, size, flags, perms);
11227 }
11228
11229 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11230 int flags, const UserPerm& perms)
11231 {
11232 _setxattr_maybe_wait_for_osdmap(name, value, size);
11233
11234 std::lock_guard lock(client_lock);
11235
11236 if (unmounting)
11237 return -ENOTCONN;
11238
11239 Fh *f = get_filehandle(fd);
11240 if (!f)
11241 return -EBADF;
11242 return _setxattr(f->inode, name, value, size, flags, perms);
11243 }
11244
11245 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11246 const UserPerm& perms)
11247 {
11248 int r;
11249
11250 const VXattr *vxattr = _match_vxattr(in, name);
11251 if (vxattr) {
11252 r = -ENODATA;
11253
11254 // Do a force getattr to get the latest quota before returning
11255 // a value to userspace.
11256 int flags = 0;
11257 if (vxattr->flags & VXATTR_RSTAT) {
11258 flags |= CEPH_STAT_RSTAT;
11259 }
11260 r = _getattr(in, flags, perms, true);
11261 if (r != 0) {
11262 // Error from getattr!
11263 return r;
11264 }
11265
11266 // call pointer-to-member function
11267 char buf[256];
11268 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11269 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11270 } else {
11271 r = -ENODATA;
11272 }
11273
11274 if (size != 0) {
11275 if (r > (int)size) {
11276 r = -ERANGE;
11277 } else if (r > 0) {
11278 memcpy(value, buf, r);
11279 }
11280 }
11281 goto out;
11282 }
11283
11284 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11285 r = -EOPNOTSUPP;
11286 goto out;
11287 }
11288
11289 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11290 if (r == 0) {
11291 string n(name);
11292 r = -ENODATA;
11293 if (in->xattrs.count(n)) {
11294 r = in->xattrs[n].length();
11295 if (r > 0 && size != 0) {
11296 if (size >= (unsigned)r)
11297 memcpy(value, in->xattrs[n].c_str(), r);
11298 else
11299 r = -ERANGE;
11300 }
11301 }
11302 }
11303 out:
11304 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
11305 return r;
11306 }
11307
11308 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11309 const UserPerm& perms)
11310 {
11311 if (cct->_conf->client_permissions) {
11312 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11313 if (r < 0)
11314 return r;
11315 }
11316 return _getxattr(in.get(), name, value, size, perms);
11317 }
11318
11319 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11320 size_t size, const UserPerm& perms)
11321 {
11322 std::lock_guard lock(client_lock);
11323
11324 if (unmounting)
11325 return -ENOTCONN;
11326
11327 vinodeno_t vino = _get_vino(in);
11328
11329 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11330 tout(cct) << __func__ << std::endl;
11331 tout(cct) << vino.ino.val << std::endl;
11332 tout(cct) << name << std::endl;
11333
11334 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11335 "fuse_default_permissions");
11336 if (!fuse_default_permissions) {
11337 int r = xattr_permission(in, name, MAY_READ, perms);
11338 if (r < 0)
11339 return r;
11340 }
11341
11342 return _getxattr(in, name, value, size, perms);
11343 }
11344
// List all xattr names (real ones from the inode plus non-hidden
// virtual ones) into `name` as a sequence of NUL-terminated strings.
// With size == 0, only the total length needed is returned (standard
// listxattr probe); otherwise returns the bytes written, or -ERANGE
// when the buffer is too small.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // Refresh xattrs from the MDS if we have never fetched them.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  // Real xattrs stored on the inode.
  for (const auto& p : in->xattrs) {
    size_t this_len = p.first.length() + 1;  // +1 for the NUL terminator
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, p.first.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }

  // Virtual (ceph.*) xattrs; the table is terminated by an entry with
  // an empty name.
  const VXattr *vxattr;
  for (vxattr = _get_vxattrs(in); vxattr && !vxattr->name.empty(); vxattr++) {
    if (vxattr->hidden)
      continue;
    // call pointer-to-member function
    if (vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))
      continue;

    size_t this_len = vxattr->name.length() + 1;
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, vxattr->name.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11397
11398 int Client::ll_listxattr(Inode *in, char *names, size_t size,
11399 const UserPerm& perms)
11400 {
11401 std::lock_guard lock(client_lock);
11402
11403 if (unmounting)
11404 return -ENOTCONN;
11405
11406 vinodeno_t vino = _get_vino(in);
11407
11408 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11409 tout(cct) << __func__ << std::endl;
11410 tout(cct) << vino.ino.val << std::endl;
11411 tout(cct) << size << std::endl;
11412
11413 return _listxattr(in, names, size, perms);
11414 }
11415
11416 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11417 size_t size, int flags, const UserPerm& perms)
11418 {
11419
11420 int xattr_flags = 0;
11421 if (!value)
11422 xattr_flags |= CEPH_XATTR_REMOVE;
11423 if (flags & XATTR_CREATE)
11424 xattr_flags |= CEPH_XATTR_CREATE;
11425 if (flags & XATTR_REPLACE)
11426 xattr_flags |= CEPH_XATTR_REPLACE;
11427
11428 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11429 filepath path;
11430 in->make_nosnap_relative_path(path);
11431 req->set_filepath(path);
11432 req->set_string2(name);
11433 req->set_inode(in);
11434 req->head.args.setxattr.flags = xattr_flags;
11435
11436 bufferlist bl;
11437 assert (value || size == 0);
11438 bl.append((const char*)value, size);
11439 req->set_data(bl);
11440
11441 int res = make_request(req, perms);
11442
11443 trim_cache();
11444 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
11445 res << dendl;
11446 return res;
11447 }
11448
// Core setxattr implementation: rejects snapshots and unknown
// namespaces, applies POSIX-ACL special handling for system.* names,
// and forwards everything else to _do_setxattr.  Quota xattrs get an
// extra post-check that a snaprealm was actually created.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Only these xattr namespaces are supported (matching the kernel
  // client); anything else is rejected outright.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// An access ACL that is equivalent to a plain mode is dropped
	// (ret == 0 clears value/size) and applied as a chmod instead.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // NOTE(review): stx is only partially initialized here; this
	  // relies on _do_setattr reading nothing beyond stx_mode for a
	  // CEPH_SETATTR_MODE-only mask — confirm.
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -EOPNOTSUPP;
      // Setting a quota requires the MDS to create a snaprealm rooted
      // at this inode; verify that below after the request succeeds.
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
11523
11524 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11525 size_t size, int flags, const UserPerm& perms)
11526 {
11527 if (cct->_conf->client_permissions) {
11528 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11529 if (r < 0)
11530 return r;
11531 }
11532 return _setxattr(in.get(), name, value, size, flags, perms);
11533 }
11534
// Validate the data pool named in a layout xattr value against the
// given osdmap.  `name` is the xattr suffix ("layout" or
// "layout.pool"); returns 0 when the pool exists (or none was named),
// -EINVAL on a malformed value, -ENOENT for an unknown pool.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    // Full layout value: parse "key=value ..." pairs and pull out the
    // pool, if any.
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    // Trailing unparsed input also counts as malformed.
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    // The pool may be given numerically or by name.
    int64_t pool;
    try {
      // NOTE(review): lexical_cast<unsigned> truncates 64-bit pool ids;
      // presumably pool ids fit in 32 bits in practice — confirm.
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      // Not a number: resolve as a pool name.
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11574
11575 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11576 {
11577 // For setting pool of layout, MetaRequest need osdmap epoch.
11578 // There is a race which create a new data pool but client and mds both don't have.
11579 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11580 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11581 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11582 string rest(strstr(name, "layout"));
11583 string v((const char*)value, size);
11584 int r = objecter->with_osdmap([&](const OSDMap& o) {
11585 return _setxattr_check_data_pool(rest, v, &o);
11586 });
11587
11588 if (r == -ENOENT) {
11589 C_SaferCond ctx;
11590 objecter->wait_for_latest_osdmap(&ctx);
11591 ctx.wait();
11592 }
11593 }
11594 }
11595
11596 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11597 size_t size, int flags, const UserPerm& perms)
11598 {
11599 _setxattr_maybe_wait_for_osdmap(name, value, size);
11600
11601 std::lock_guard lock(client_lock);
11602
11603 if (unmounting)
11604 return -ENOTCONN;
11605
11606 vinodeno_t vino = _get_vino(in);
11607
11608 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11609 tout(cct) << __func__ << std::endl;
11610 tout(cct) << vino.ino.val << std::endl;
11611 tout(cct) << name << std::endl;
11612
11613 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11614 "fuse_default_permissions");
11615 if (!fuse_default_permissions) {
11616 int r = xattr_permission(in, name, MAY_WRITE, perms);
11617 if (r < 0)
11618 return r;
11619 }
11620 return _setxattr(in, name, value, size, flags, perms);
11621 }
11622
11623 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11624 {
11625 if (in->snapid != CEPH_NOSNAP) {
11626 return -EROFS;
11627 }
11628
11629 // same xattrs supported by kernel client
11630 if (strncmp(name, "user.", 5) &&
11631 strncmp(name, "system.", 7) &&
11632 strncmp(name, "security.", 9) &&
11633 strncmp(name, "trusted.", 8) &&
11634 strncmp(name, "ceph.", 5))
11635 return -EOPNOTSUPP;
11636
11637 const VXattr *vxattr = _match_vxattr(in, name);
11638 if (vxattr && vxattr->readonly)
11639 return -EOPNOTSUPP;
11640
11641 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11642 filepath path;
11643 in->make_nosnap_relative_path(path);
11644 req->set_filepath(path);
11645 req->set_filepath2(name);
11646 req->set_inode(in);
11647
11648 int res = make_request(req, perms);
11649
11650 trim_cache();
11651 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11652 return res;
11653 }
11654
11655 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11656 {
11657 if (cct->_conf->client_permissions) {
11658 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11659 if (r < 0)
11660 return r;
11661 }
11662 return _removexattr(in.get(), name, perms);
11663 }
11664
11665 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11666 {
11667 std::lock_guard lock(client_lock);
11668
11669 if (unmounting)
11670 return -ENOTCONN;
11671
11672 vinodeno_t vino = _get_vino(in);
11673
11674 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11675 tout(cct) << "ll_removexattr" << std::endl;
11676 tout(cct) << vino.ino.val << std::endl;
11677 tout(cct) << name << std::endl;
11678
11679 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11680 "fuse_default_permissions");
11681 if (!fuse_default_permissions) {
11682 int r = xattr_permission(in, name, MAY_WRITE, perms);
11683 if (r < 0)
11684 return r;
11685 }
11686
11687 return _removexattr(in, name, perms);
11688 }
11689
11690 bool Client::_vxattrcb_quota_exists(Inode *in)
11691 {
11692 return in->quota.is_enable() &&
11693 in->snaprealm && in->snaprealm->ino == in->ino;
11694 }
11695 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11696 {
11697 return snprintf(val, size,
11698 "max_bytes=%lld max_files=%lld",
11699 (long long int)in->quota.max_bytes,
11700 (long long int)in->quota.max_files);
11701 }
11702 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11703 {
11704 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11705 }
11706 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11707 {
11708 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11709 }
11710
11711 bool Client::_vxattrcb_layout_exists(Inode *in)
11712 {
11713 return in->layout != file_layout_t();
11714 }
11715 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11716 {
11717 int r = snprintf(val, size,
11718 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11719 (unsigned long long)in->layout.stripe_unit,
11720 (unsigned long long)in->layout.stripe_count,
11721 (unsigned long long)in->layout.object_size);
11722 objecter->with_osdmap([&](const OSDMap& o) {
11723 if (o.have_pg_pool(in->layout.pool_id))
11724 r += snprintf(val + r, size - r, "%s",
11725 o.get_pool_name(in->layout.pool_id).c_str());
11726 else
11727 r += snprintf(val + r, size - r, "%" PRIu64,
11728 (uint64_t)in->layout.pool_id);
11729 });
11730 if (in->layout.pool_ns.length())
11731 r += snprintf(val + r, size - r, " pool_namespace=%s",
11732 in->layout.pool_ns.c_str());
11733 return r;
11734 }
11735 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11736 {
11737 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
11738 }
11739 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11740 {
11741 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
11742 }
11743 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11744 {
11745 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
11746 }
11747 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11748 {
11749 size_t r;
11750 objecter->with_osdmap([&](const OSDMap& o) {
11751 if (o.have_pg_pool(in->layout.pool_id))
11752 r = snprintf(val, size, "%s", o.get_pool_name(
11753 in->layout.pool_id).c_str());
11754 else
11755 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11756 });
11757 return r;
11758 }
11759 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11760 {
11761 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11762 }
11763 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11764 {
11765 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11766 }
11767 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11768 {
11769 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
11770 }
11771 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11772 {
11773 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
11774 }
11775 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11776 {
11777 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11778 }
11779 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11780 {
11781 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
11782 }
11783 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11784 {
11785 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
11786 }
11787 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11788 {
11789 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
11790 }
11791 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11792 {
11793 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
11794 (long)in->rstat.rctime.nsec());
11795 }
11796 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11797 {
11798 return in->dir_pin != -ENODATA;
11799 }
11800 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11801 {
11802 return snprintf(val, size, "%ld", (long)in->dir_pin);
11803 }
11804
11805 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
11806 {
11807 return !in->snap_btime.is_zero();
11808 }
11809
11810 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
11811 {
11812 return snprintf(val, size, "%llu.%09lu",
11813 (long long unsigned)in->snap_btime.sec(),
11814 (long unsigned)in->snap_btime.nsec());
11815 }
11816
// Helpers for building the VXattr tables below.  The entries use GNU
// designated-initializer ("name:") syntax matching struct VXattr's field
// order.
11817 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11818 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11819
// Read-only, user-visible vxattr with no existence callback (always
// present); getter is Client::_vxattrcb_<type>_<name>.
11820 #define XATTR_NAME_CEPH(_type, _name) \
11821 { \
11822 name: CEPH_XATTR_NAME(_type, _name), \
11823 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11824 readonly: true, \
11825 hidden: false, \
11826 exists_cb: NULL, \
11827 flags: 0, \
11828 }
// Same as XATTR_NAME_CEPH but with explicit flags (e.g. VXATTR_RSTAT).
11829 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
11830 { \
11831 name: CEPH_XATTR_NAME(_type, _name), \
11832 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11833 readonly: true, \
11834 hidden: false, \
11835 exists_cb: NULL, \
11836 flags: _flags, \
11837 }
// Writable, hidden per-field layout vxattr; only reported when the inode
// has a non-default layout (_vxattrcb_layout_exists).
11838 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11839 { \
11840 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11841 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11842 readonly: false, \
11843 hidden: true, \
11844 exists_cb: &Client::_vxattrcb_layout_exists, \
11845 flags: 0, \
11846 }
// Writable, hidden per-field quota vxattr; only reported when a quota is
// configured (_vxattrcb_quota_exists).
11847 #define XATTR_QUOTA_FIELD(_type, _name) \
11848 { \
11849 name: CEPH_XATTR_NAME(_type, _name), \
11850 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11851 readonly: false, \
11852 hidden: true, \
11853 exists_cb: &Client::_vxattrcb_quota_exists, \
11854 flags: 0, \
11855 }
11856
// Virtual xattrs available on directory inodes, matched linearly by
// _match_vxattr.  The table ends with an empty-name sentinel entry.
11857 const Client::VXattr Client::_dir_vxattrs[] = {
11858 {
11859 name: "ceph.dir.layout",
11860 getxattr_cb: &Client::_vxattrcb_layout,
11861 readonly: false,
11862 hidden: true,
11863 exists_cb: &Client::_vxattrcb_layout_exists,
11864 flags: 0,
11865 },
11866 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11867 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11868 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11869 XATTR_LAYOUT_FIELD(dir, layout, pool),
11870 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11871 XATTR_NAME_CEPH(dir, entries),
11872 XATTR_NAME_CEPH(dir, files),
11873 XATTR_NAME_CEPH(dir, subdirs),
// Recursive-stat vxattrs are flagged so callers can tell they reflect
// (possibly lagging) rstat data.
11874 XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
11875 XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
11876 XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
11877 XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
11878 XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
11879 {
11880 name: "ceph.quota",
11881 getxattr_cb: &Client::_vxattrcb_quota,
11882 readonly: false,
11883 hidden: true,
11884 exists_cb: &Client::_vxattrcb_quota_exists,
11885 flags: 0,
11886 },
11887 XATTR_QUOTA_FIELD(quota, max_bytes),
11888 XATTR_QUOTA_FIELD(quota, max_files),
11889 {
11890 name: "ceph.dir.pin",
11891 getxattr_cb: &Client::_vxattrcb_dir_pin,
11892 readonly: false,
11893 hidden: true,
11894 exists_cb: &Client::_vxattrcb_dir_pin_exists,
11895 flags: 0,
11896 },
11897 {
11898 name: "ceph.snap.btime",
11899 getxattr_cb: &Client::_vxattrcb_snap_btime,
11900 readonly: true,
11901 hidden: false,
11902 exists_cb: &Client::_vxattrcb_snap_btime_exists,
11903 flags: 0,
11904 },
11905 { name: "" } /* Required table terminator */
11906 };
11907
// Virtual xattrs available on regular-file inodes; same layout entries as
// the directory table plus snapshot birth time.  Empty-name sentinel ends
// the table.
11908 const Client::VXattr Client::_file_vxattrs[] = {
11909 {
11910 name: "ceph.file.layout",
11911 getxattr_cb: &Client::_vxattrcb_layout,
11912 readonly: false,
11913 hidden: true,
11914 exists_cb: &Client::_vxattrcb_layout_exists,
11915 flags: 0,
11916 },
11917 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
11918 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
11919 XATTR_LAYOUT_FIELD(file, layout, object_size),
11920 XATTR_LAYOUT_FIELD(file, layout, pool),
11921 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
11922 {
11923 name: "ceph.snap.btime",
11924 getxattr_cb: &Client::_vxattrcb_snap_btime,
11925 readonly: true,
11926 hidden: false,
11927 exists_cb: &Client::_vxattrcb_snap_btime_exists,
11928 flags: 0,
11929 },
11930 { name: "" } /* Required table terminator */
11931 };
11932
11933 const Client::VXattr *Client::_get_vxattrs(Inode *in)
11934 {
11935 if (in->is_dir())
11936 return _dir_vxattrs;
11937 else if (in->is_file())
11938 return _file_vxattrs;
11939 return NULL;
11940 }
11941
11942 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11943 {
11944 if (strncmp(name, "ceph.", 5) == 0) {
11945 const VXattr *vxattr = _get_vxattrs(in);
11946 if (vxattr) {
11947 while (!vxattr->name.empty()) {
11948 if (vxattr->name == name)
11949 return vxattr;
11950 vxattr++;
11951 }
11952 }
11953 }
11954 return NULL;
11955 }
11956
11957 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11958 {
11959 std::lock_guard lock(client_lock);
11960
11961 if (unmounting)
11962 return -ENOTCONN;
11963
11964 vinodeno_t vino = _get_vino(in);
11965
11966 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11967 tout(cct) << "ll_readlink" << std::endl;
11968 tout(cct) << vino.ino.val << std::endl;
11969
11970 for (auto dn : in->dentries) {
11971 touch_dn(dn);
11972 }
11973
11974 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11975 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11976 return r;
11977 }
11978
// Create a device/special file 'name' under 'dir' via CEPH_MDS_OP_MKNOD.
// On success returns 0 and fills *inp with the new inode; otherwise a
// negative errno.  Snapshotted directories are read-only (-EROFS) and the
// file-count quota is enforced before sending anything (-EDQUOT).
11979 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
11980 const UserPerm& perms, InodeRef *inp)
11981 {
11982 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
11983 << mode << dec << ", " << rdev << ", uid " << perms.uid()
11984 << ", gid " << perms.gid() << ")" << dendl;
11985
11986 if (strlen(name) > NAME_MAX)
11987 return -ENAMETOOLONG;
11988
11989 if (dir->snapid != CEPH_NOSNAP) {
11990 return -EROFS;
11991 }
11992 if (is_quota_files_exceeded(dir, perms)) {
11993 return -EDQUOT;
11994 }
11995
11996 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
11997
11998 filepath path;
11999 dir->make_nosnap_relative_path(path);
12000 path.push_dentry(name);
12001 req->set_filepath(path);
12002 req->set_inode(dir);
12003 req->head.args.mknod.rdev = rdev;
12004 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12005 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12006
// Apply default POSIX ACLs inherited from 'dir'; this may also modify
// 'mode', so it must run before the mode is stored in the request.
12007 bufferlist xattrs_bl;
12008 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12009 if (res < 0)
12010 goto fail;
12011 req->head.args.mknod.mode = mode;
12012 if (xattrs_bl.length() > 0)
12013 req->set_data(xattrs_bl);
12014
12015 Dentry *de;
12016 res = get_or_create(dir, name, &de);
12017 if (res < 0)
12018 goto fail;
12019 req->set_dentry(de);
12020
// Blocks until the MDS replies; make_request consumes the request ref.
12021 res = make_request(req, perms, inp);
12022
12023 trim_cache();
12024
12025 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12026 return res;
12027
// Error before the request was submitted: drop our reference ourselves.
12028 fail:
12029 put_request(req);
12030 return res;
12031 }
12032
12033 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12034 dev_t rdev, struct stat *attr, Inode **out,
12035 const UserPerm& perms)
12036 {
12037 std::lock_guard lock(client_lock);
12038
12039 if (unmounting)
12040 return -ENOTCONN;
12041
12042 vinodeno_t vparent = _get_vino(parent);
12043
12044 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12045 tout(cct) << "ll_mknod" << std::endl;
12046 tout(cct) << vparent.ino.val << std::endl;
12047 tout(cct) << name << std::endl;
12048 tout(cct) << mode << std::endl;
12049 tout(cct) << rdev << std::endl;
12050
12051 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12052 "fuse_default_permissions");
12053 if (!fuse_default_permissions) {
12054 int r = may_create(parent, perms);
12055 if (r < 0)
12056 return r;
12057 }
12058
12059 InodeRef in;
12060 int r = _mknod(parent, name, mode, rdev, perms, &in);
12061 if (r == 0) {
12062 fill_stat(in, attr);
12063 _ll_get(in.get());
12064 }
12065 tout(cct) << attr->st_ino << std::endl;
12066 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12067 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12068 *out = in.get();
12069 return r;
12070 }
12071
12072 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12073 dev_t rdev, Inode **out,
12074 struct ceph_statx *stx, unsigned want, unsigned flags,
12075 const UserPerm& perms)
12076 {
12077 unsigned caps = statx_to_mask(flags, want);
12078 std::lock_guard lock(client_lock);
12079
12080 if (unmounting)
12081 return -ENOTCONN;
12082
12083 vinodeno_t vparent = _get_vino(parent);
12084
12085 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12086 tout(cct) << "ll_mknodx" << std::endl;
12087 tout(cct) << vparent.ino.val << std::endl;
12088 tout(cct) << name << std::endl;
12089 tout(cct) << mode << std::endl;
12090 tout(cct) << rdev << std::endl;
12091
12092 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12093 "fuse_default_permissions");
12094 if (!fuse_default_permissions) {
12095 int r = may_create(parent, perms);
12096 if (r < 0)
12097 return r;
12098 }
12099
12100 InodeRef in;
12101 int r = _mknod(parent, name, mode, rdev, perms, &in);
12102 if (r == 0) {
12103 fill_statx(in, caps, stx);
12104 _ll_get(in.get());
12105 }
12106 tout(cct) << stx->stx_ino << std::endl;
12107 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12108 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12109 *out = in.get();
12110 return r;
12111 }
12112
// Create and (optionally) open a regular file 'name' under 'dir' with
// CEPH_MDS_OP_CREATE.  On success fills *inp with the new inode and, when
// fhp is non-NULL, opens it and stores the handle in *fhp.  *created is
// set by make_request to whether the file was newly created.  Returns 0
// or a negative errno.
12113 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
12114 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
12115 int object_size, const char *data_pool, bool *created,
12116 const UserPerm& perms)
12117 {
12118 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
12119 mode << dec << ")" << dendl;
12120
12121 if (strlen(name) > NAME_MAX)
12122 return -ENAMETOOLONG;
12123 if (dir->snapid != CEPH_NOSNAP) {
12124 return -EROFS;
12125 }
12126 if (is_quota_files_exceeded(dir, perms)) {
12127 return -EDQUOT;
12128 }
12129
12130 // use normalized flags to generate cmode
12131 int cflags = ceph_flags_sys2wire(flags);
12132 if (cct->_conf.get_val<bool>("client_force_lazyio"))
12133 cflags |= CEPH_O_LAZY;
12134
12135 int cmode = ceph_flags_to_mode(cflags);
12136
// Resolve the optional data-pool name to an id; the wire field for the
// pool is 32 bits, hence the -ERANGE check.
12137 int64_t pool_id = -1;
12138 if (data_pool && *data_pool) {
12139 pool_id = objecter->with_osdmap(
12140 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
12141 if (pool_id < 0)
12142 return -EINVAL;
12143 if (pool_id > 0xffffffffll)
12144 return -ERANGE; // bummer!
12145 }
12146
12147 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
12148
12149 filepath path;
12150 dir->make_nosnap_relative_path(path);
12151 path.push_dentry(name);
12152 req->set_filepath(path);
12153 req->set_inode(dir);
12154 req->head.args.open.flags = cflags | CEPH_O_CREAT;
12155
12156 req->head.args.open.stripe_unit = stripe_unit;
12157 req->head.args.open.stripe_count = stripe_count;
12158 req->head.args.open.object_size = object_size;
12159 if (cct->_conf->client_debug_getattr_caps)
12160 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
12161 else
12162 req->head.args.open.mask = 0;
12163 req->head.args.open.pool = pool_id;
12164 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12165 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12166
// Apply inherited default ACLs; may modify 'mode', so do it before the
// mode is stored in the request.
12167 mode |= S_IFREG;
12168 bufferlist xattrs_bl;
12169 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12170 if (res < 0)
12171 goto fail;
12172 req->head.args.open.mode = mode;
12173 if (xattrs_bl.length() > 0)
12174 req->set_data(xattrs_bl);
12175
12176 Dentry *de;
12177 res = get_or_create(dir, name, &de);
12178 if (res < 0)
12179 goto fail;
12180 req->set_dentry(de);
12181
12182 res = make_request(req, perms, inp, created);
12183 if (res < 0) {
12184 goto reply_error;
12185 }
12186
12187 /* If the caller passed a value in fhp, do the open */
12188 if(fhp) {
12189 (*inp)->get_open_ref(cmode);
12190 *fhp = _create_fh(inp->get(), flags, cmode, perms);
12191 }
12192
// After make_request the request ref has been consumed; only log and
// trim from here, regardless of success or failure.
12193 reply_error:
12194 trim_cache();
12195
12196 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
12197 << " layout " << stripe_unit
12198 << ' ' << stripe_count
12199 << ' ' << object_size
12200 <<") = " << res << dendl;
12201 return res;
12202
// Error before submission: we still own the request ref and must drop it.
12203 fail:
12204 put_request(req);
12205 return res;
12206 }
12207
12208
// Create directory 'name' under 'dir'.  When 'dir' is the special
// snapshot directory (CEPH_SNAPDIR) this creates a snapshot
// (CEPH_MDS_OP_MKSNAP) instead of a regular mkdir.  On success fills *inp
// with the new inode; returns 0 or a negative errno.
12209 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
12210 InodeRef *inp)
12211 {
12212 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
12213 << mode << dec << ", uid " << perm.uid()
12214 << ", gid " << perm.gid() << ")" << dendl;
12215
12216 if (strlen(name) > NAME_MAX)
12217 return -ENAMETOOLONG;
12218
// Snapshots may be created under the snapdir; any other snapshotted dir
// is read-only.
12219 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12220 return -EROFS;
12221 }
12222 if (is_quota_files_exceeded(dir, perm)) {
12223 return -EDQUOT;
12224 }
12225 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
12226 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
12227
12228 filepath path;
12229 dir->make_nosnap_relative_path(path);
12230 path.push_dentry(name);
12231 req->set_filepath(path);
12232 req->set_inode(dir);
12233 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12234 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12235
// Apply inherited default ACLs; may modify 'mode', so run before the
// mode is stored in the request.
12236 mode |= S_IFDIR;
12237 bufferlist xattrs_bl;
12238 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
12239 if (res < 0)
12240 goto fail;
12241 req->head.args.mkdir.mode = mode;
12242 if (xattrs_bl.length() > 0)
12243 req->set_data(xattrs_bl);
12244
12245 Dentry *de;
12246 res = get_or_create(dir, name, &de);
12247 if (res < 0)
12248 goto fail;
12249 req->set_dentry(de);
12250
12251 ldout(cct, 10) << "_mkdir: making request" << dendl;
12252 res = make_request(req, perm, inp);
12253 ldout(cct, 10) << "_mkdir result is " << res << dendl;
12254
12255 trim_cache();
12256
12257 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12258 return res;
12259
// Error before submission: drop our request reference.
12260 fail:
12261 put_request(req);
12262 return res;
12263 }
12264
12265 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12266 struct stat *attr, Inode **out, const UserPerm& perm)
12267 {
12268 std::lock_guard lock(client_lock);
12269
12270 if (unmounting)
12271 return -ENOTCONN;
12272
12273 vinodeno_t vparent = _get_vino(parent);
12274
12275 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12276 tout(cct) << "ll_mkdir" << std::endl;
12277 tout(cct) << vparent.ino.val << std::endl;
12278 tout(cct) << name << std::endl;
12279 tout(cct) << mode << std::endl;
12280
12281 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12282 "fuse_default_permissions");
12283 if (!fuse_default_permissions) {
12284 int r = may_create(parent, perm);
12285 if (r < 0)
12286 return r;
12287 }
12288
12289 InodeRef in;
12290 int r = _mkdir(parent, name, mode, perm, &in);
12291 if (r == 0) {
12292 fill_stat(in, attr);
12293 _ll_get(in.get());
12294 }
12295 tout(cct) << attr->st_ino << std::endl;
12296 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12297 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12298 *out = in.get();
12299 return r;
12300 }
12301
12302 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12303 struct ceph_statx *stx, unsigned want, unsigned flags,
12304 const UserPerm& perms)
12305 {
12306 std::lock_guard lock(client_lock);
12307
12308 if (unmounting)
12309 return -ENOTCONN;
12310
12311 vinodeno_t vparent = _get_vino(parent);
12312
12313 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12314 tout(cct) << "ll_mkdirx" << std::endl;
12315 tout(cct) << vparent.ino.val << std::endl;
12316 tout(cct) << name << std::endl;
12317 tout(cct) << mode << std::endl;
12318
12319 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12320 "fuse_default_permissions");
12321 if (!fuse_default_permissions) {
12322 int r = may_create(parent, perms);
12323 if (r < 0)
12324 return r;
12325 }
12326
12327 InodeRef in;
12328 int r = _mkdir(parent, name, mode, perms, &in);
12329 if (r == 0) {
12330 fill_statx(in, statx_to_mask(flags, want), stx);
12331 _ll_get(in.get());
12332 } else {
12333 stx->stx_ino = 0;
12334 stx->stx_mask = 0;
12335 }
12336 tout(cct) << stx->stx_ino << std::endl;
12337 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12338 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12339 *out = in.get();
12340 return r;
12341 }
12342
// Create a symlink 'name' -> 'target' under 'dir' via CEPH_MDS_OP_SYMLINK.
// On success fills *inp with the new inode; returns 0 or a negative errno.
12343 int Client::_symlink(Inode *dir, const char *name, const char *target,
12344 const UserPerm& perms, InodeRef *inp)
12345 {
12346 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
12347 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
12348 << dendl;
12349
12350 if (strlen(name) > NAME_MAX)
12351 return -ENAMETOOLONG;
12352
12353 if (dir->snapid != CEPH_NOSNAP) {
12354 return -EROFS;
12355 }
12356 if (is_quota_files_exceeded(dir, perms)) {
12357 return -EDQUOT;
12358 }
12359
12360 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
12361
12362 filepath path;
12363 dir->make_nosnap_relative_path(path);
12364 path.push_dentry(name);
12365 req->set_filepath(path);
12366 req->set_inode(dir);
// The link target travels in the request's second string slot.
12367 req->set_string2(target);
12368 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12369 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12370
12371 Dentry *de;
12372 int res = get_or_create(dir, name, &de);
12373 if (res < 0)
12374 goto fail;
12375 req->set_dentry(de);
12376
12377 res = make_request(req, perms, inp);
12378
12379 trim_cache();
12380 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
12381 res << dendl;
12382 return res;
12383
// Error before submission: drop our request reference.
12384 fail:
12385 put_request(req);
12386 return res;
12387 }
12388
12389 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12390 struct stat *attr, Inode **out, const UserPerm& perms)
12391 {
12392 std::lock_guard lock(client_lock);
12393
12394 if (unmounting)
12395 return -ENOTCONN;
12396
12397 vinodeno_t vparent = _get_vino(parent);
12398
12399 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12400 << dendl;
12401 tout(cct) << "ll_symlink" << std::endl;
12402 tout(cct) << vparent.ino.val << std::endl;
12403 tout(cct) << name << std::endl;
12404 tout(cct) << value << std::endl;
12405
12406 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12407 "fuse_default_permissions");
12408 if (!fuse_default_permissions) {
12409 int r = may_create(parent, perms);
12410 if (r < 0)
12411 return r;
12412 }
12413
12414 InodeRef in;
12415 int r = _symlink(parent, name, value, perms, &in);
12416 if (r == 0) {
12417 fill_stat(in, attr);
12418 _ll_get(in.get());
12419 }
12420 tout(cct) << attr->st_ino << std::endl;
12421 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12422 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12423 *out = in.get();
12424 return r;
12425 }
12426
12427 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12428 Inode **out, struct ceph_statx *stx, unsigned want,
12429 unsigned flags, const UserPerm& perms)
12430 {
12431 std::lock_guard lock(client_lock);
12432
12433 if (unmounting)
12434 return -ENOTCONN;
12435
12436 vinodeno_t vparent = _get_vino(parent);
12437
12438 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12439 << dendl;
12440 tout(cct) << "ll_symlinkx" << std::endl;
12441 tout(cct) << vparent.ino.val << std::endl;
12442 tout(cct) << name << std::endl;
12443 tout(cct) << value << std::endl;
12444
12445 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12446 "fuse_default_permissions");
12447 if (!fuse_default_permissions) {
12448 int r = may_create(parent, perms);
12449 if (r < 0)
12450 return r;
12451 }
12452
12453 InodeRef in;
12454 int r = _symlink(parent, name, value, perms, &in);
12455 if (r == 0) {
12456 fill_statx(in, statx_to_mask(flags, want), stx);
12457 _ll_get(in.get());
12458 }
12459 tout(cct) << stx->stx_ino << std::endl;
12460 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12461 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12462 *out = in.get();
12463 return r;
12464 }
12465
// Remove directory entry 'name' from 'dir' via CEPH_MDS_OP_UNLINK.
// Looks up the target inode so its delegations can be broken and the MDS
// can be told which caps to drop.  Returns 0 or a negative errno.
12466 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12467 {
12468 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
12469 << " uid " << perm.uid() << " gid " << perm.gid()
12470 << ")" << dendl;
12471
12472 if (dir->snapid != CEPH_NOSNAP) {
12473 return -EROFS;
12474 }
12475
12476 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12477
12478 filepath path;
12479 dir->make_nosnap_relative_path(path);
12480 path.push_dentry(name);
12481 req->set_filepath(path);
12482
12483 InodeRef otherin;
12484 Inode *in;
12485 Dentry *de;
12486
12487 int res = get_or_create(dir, name, &de);
12488 if (res < 0)
12489 goto fail;
12490 req->set_dentry(de);
12491 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12492 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12493
// Resolve the inode being unlinked so we can break delegations and
// attach it to the request as the "other" inode.
12494 res = _lookup(dir, name, 0, &otherin, perm);
12495 if (res < 0)
12496 goto fail;
12497
12498 in = otherin.get();
12499 req->set_other_inode(in);
12500 in->break_all_delegs();
12501 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12502
12503 req->set_inode(dir);
12504
12505 res = make_request(req, perm);
12506
12507 trim_cache();
12508 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
12509 return res;
12510
// Error before submission: drop our request reference.
12511 fail:
12512 put_request(req);
12513 return res;
12514 }
12515
12516 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12517 {
12518 std::lock_guard lock(client_lock);
12519
12520 if (unmounting)
12521 return -ENOTCONN;
12522
12523 vinodeno_t vino = _get_vino(in);
12524
12525 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12526 tout(cct) << "ll_unlink" << std::endl;
12527 tout(cct) << vino.ino.val << std::endl;
12528 tout(cct) << name << std::endl;
12529
12530 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12531 "fuse_default_permissions");
12532 if (!fuse_default_permissions) {
12533 int r = may_delete(in, name, perm);
12534 if (r < 0)
12535 return r;
12536 }
12537 return _unlink(in, name, perm);
12538 }
12539
// Remove subdirectory 'name' from 'dir'.  When 'dir' is the snapshot
// directory this removes a snapshot (CEPH_MDS_OP_RMSNAP) instead of a
// regular rmdir.  Returns 0 or a negative errno.
12540 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12541 {
12542 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
12543 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12544
// Snapshots may be removed via the snapdir; other snapshotted dirs are
// read-only.
12545 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12546 return -EROFS;
12547 }
12548
12549 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12550 MetaRequest *req = new MetaRequest(op);
12551 filepath path;
12552 dir->make_nosnap_relative_path(path);
12553 path.push_dentry(name);
12554 req->set_filepath(path);
12555 req->set_inode(dir);
12556
12557 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12558 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12559 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12560
12561 InodeRef in;
12562
// For RMDIR the request owns the dentry; for RMSNAP we hold our own
// temporary ref (de->get()) because the reply carries no trace dentry
// and we must unlink it from the cache ourselves below.
12563 Dentry *de;
12564 int res = get_or_create(dir, name, &de);
12565 if (res < 0)
12566 goto fail;
12567 if (op == CEPH_MDS_OP_RMDIR)
12568 req->set_dentry(de);
12569 else
12570 de->get();
12571
12572 res = _lookup(dir, name, 0, &in, perms);
12573 if (res < 0)
12574 goto fail;
12575
// Invalidate the snapshot dentry manually and drop the temporary ref.
12576 if (op == CEPH_MDS_OP_RMSNAP) {
12577 unlink(de, true, true);
12578 de->put();
12579 }
12580 req->set_other_inode(in.get());
12581
12582 res = make_request(req, perms);
12583
12584 trim_cache();
12585 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
12586 return res;
12587
// Error before submission: drop our request reference.
12588 fail:
12589 put_request(req);
12590 return res;
12591 }
12592
12593 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12594 {
12595 std::lock_guard lock(client_lock);
12596
12597 if (unmounting)
12598 return -ENOTCONN;
12599
12600 vinodeno_t vino = _get_vino(in);
12601
12602 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12603 tout(cct) << "ll_rmdir" << std::endl;
12604 tout(cct) << vino.ino.val << std::endl;
12605 tout(cct) << name << std::endl;
12606
12607 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12608 "fuse_default_permissions");
12609 if (!fuse_default_permissions) {
12610 int r = may_delete(in, name, perms);
12611 if (r < 0)
12612 return r;
12613 }
12614
12615 return _rmdir(in, name, perms);
12616 }
12617
// Rename fromdir/fromname to todir/toname.  Renames across snapshots or
// across quota roots are refused with -EXDEV; renaming within the snapdir
// becomes CEPH_MDS_OP_RENAMESNAP.  Returns 0 or a negative errno.
12618 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12619 {
12620 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
12621 << todir->ino << " " << toname
12622 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12623 << dendl;
12624
12625 if (fromdir->snapid != todir->snapid)
12626 return -EXDEV;
12627
12628 int op = CEPH_MDS_OP_RENAME;
12629 if (fromdir->snapid != CEPH_NOSNAP) {
12630 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12631 op = CEPH_MDS_OP_RENAMESNAP;
12632 else
12633 return -EROFS;
12634 }
// A rename must stay within one quota subtree, otherwise usage
// accounting would have to move between quota roots.
12635 if (fromdir != todir) {
12636 Inode *fromdir_root =
12637 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12638 Inode *todir_root =
12639 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12640 if (fromdir_root != todir_root) {
12641 return -EXDEV;
12642 }
12643 }
12644
12645 InodeRef target;
12646 MetaRequest *req = new MetaRequest(op);
12647
// Note: destination path goes in filepath, source in filepath2.
12648 filepath from;
12649 fromdir->make_nosnap_relative_path(from);
12650 from.push_dentry(fromname);
12651 filepath to;
12652 todir->make_nosnap_relative_path(to);
12653 to.push_dentry(toname);
12654 req->set_filepath(to);
12655 req->set_filepath2(from);
12656
12657 Dentry *oldde;
12658 int res = get_or_create(fromdir, fromname, &oldde);
12659 if (res < 0)
12660 goto fail;
12661 Dentry *de;
12662 res = get_or_create(todir, toname, &de);
12663 if (res < 0)
12664 goto fail;
12665
12666 if (op == CEPH_MDS_OP_RENAME) {
12667 req->set_old_dentry(oldde);
12668 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12669 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12670
12671 req->set_dentry(de);
12672 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12673 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12674
// Resolve the source inode so its delegations can be broken before the
// MDS moves it.
12675 InodeRef oldin, otherin;
12676 res = _lookup(fromdir, fromname, 0, &oldin, perm);
12677 if (res < 0)
12678 goto fail;
12679
12680 Inode *oldinode = oldin.get();
12681 oldinode->break_all_delegs();
12682 req->set_old_inode(oldinode);
12683 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12684
// The destination may or may not exist: if it does, it will be replaced,
// so break its delegations too; -ENOENT is simply "nothing to replace".
12685 res = _lookup(todir, toname, 0, &otherin, perm);
12686 switch (res) {
12687 case 0:
12688 {
12689 Inode *in = otherin.get();
12690 req->set_other_inode(in);
12691 in->break_all_delegs();
12692 }
12693 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12694 break;
12695 case -ENOENT:
12696 break;
12697 default:
12698 goto fail;
12699 }
12700
12701 req->set_inode(todir);
12702 } else {
12703 // renamesnap reply contains no tracedn, so we need to invalidate
12704 // dentry manually
12705 unlink(oldde, true, true);
12706 unlink(de, true, true);
12707
12708 req->set_inode(todir);
12709 }
12710
12711 res = make_request(req, perm, &target);
12712 ldout(cct, 10) << "rename result is " << res << dendl;
12713
12714 // renamed item from our cache
12715
12716 trim_cache();
12717 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
12718 return res;
12719
// Error before submission: drop our request reference.
12720 fail:
12721 put_request(req);
12722 return res;
12723 }
12724
12725 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12726 const char *newname, const UserPerm& perm)
12727 {
12728 std::lock_guard lock(client_lock);
12729
12730 if (unmounting)
12731 return -ENOTCONN;
12732
12733 vinodeno_t vparent = _get_vino(parent);
12734 vinodeno_t vnewparent = _get_vino(newparent);
12735
12736 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12737 << vnewparent << " " << newname << dendl;
12738 tout(cct) << "ll_rename" << std::endl;
12739 tout(cct) << vparent.ino.val << std::endl;
12740 tout(cct) << name << std::endl;
12741 tout(cct) << vnewparent.ino.val << std::endl;
12742 tout(cct) << newname << std::endl;
12743
12744 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12745 "fuse_default_permissions");
12746 if (!fuse_default_permissions) {
12747 int r = may_delete(parent, name, perm);
12748 if (r < 0)
12749 return r;
12750 r = may_delete(newparent, newname, perm);
12751 if (r < 0 && r != -ENOENT)
12752 return r;
12753 }
12754
12755 return _rename(parent, name, newparent, newname, perm);
12756 }
12757
// Create a hard link to inode `in` named `newname` in directory `dir`.
// Returns 0 on success or a negative errno; on success *inp may be filled
// with the linked inode by make_request().
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only: neither the source inode nor the target
  // directory may be a snapshot.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // Linking changes the link count, so recall any delegations first.
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  // filepath: new name relative to the target directory.
  filepath path(newname, dir->ino);
  req->set_filepath(path);
  // filepath2: the existing inode being linked.
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  // Drop shared caps on the directory (its contents change), unless we
  // hold the exclusive cap.
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // make_request() consumes the request on the success path; on early
  // failure we must release our reference ourselves.
  put_request(req);
  return res;
}
12802
12803 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12804 const UserPerm& perm)
12805 {
12806 std::lock_guard lock(client_lock);
12807
12808 if (unmounting)
12809 return -ENOTCONN;
12810
12811 vinodeno_t vino = _get_vino(in);
12812 vinodeno_t vnewparent = _get_vino(newparent);
12813
12814 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12815 newname << dendl;
12816 tout(cct) << "ll_link" << std::endl;
12817 tout(cct) << vino.ino.val << std::endl;
12818 tout(cct) << vnewparent << std::endl;
12819 tout(cct) << newname << std::endl;
12820
12821 InodeRef target;
12822
12823 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12824 "fuse_default_permissions");
12825 if (!fuse_default_permissions) {
12826 if (S_ISDIR(in->mode))
12827 return -EPERM;
12828
12829 int r = may_hardlink(in, perm);
12830 if (r < 0)
12831 return r;
12832
12833 r = may_create(newparent, perm);
12834 if (r < 0)
12835 return r;
12836 }
12837
12838 return _link(in, newparent, newname, perm, &target);
12839 }
12840
12841 int Client::ll_num_osds(void)
12842 {
12843 std::lock_guard lock(client_lock);
12844 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12845 }
12846
12847 int Client::ll_osdaddr(int osd, uint32_t *addr)
12848 {
12849 std::lock_guard lock(client_lock);
12850
12851 entity_addr_t g;
12852 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12853 if (!o.exists(osd))
12854 return false;
12855 g = o.get_addrs(osd).front();
12856 return true;
12857 });
12858 if (!exists)
12859 return -1;
12860 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12861 *addr = ntohl(nb_addr);
12862 return 0;
12863 }
12864
// Return the stripe unit (bytes) of the inode's file layout.
uint32_t Client::ll_stripe_unit(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->layout.stripe_unit;
}
12870
// Return the snapshot sequence number of the inode's snap realm.
// NOTE(review): dereferences in->snaprealm unconditionally — assumes every
// inode handed to the ll_ API has a realm attached; confirm.
uint64_t Client::ll_snap_seq(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->snaprealm->seq;
}
12876
// Copy the inode's file layout into *layout. Always succeeds.
int Client::ll_file_layout(Inode *in, file_layout_t *layout)
{
  std::lock_guard lock(client_lock);
  *layout = in->layout;
  return 0;
}
12883
// Fh-based convenience wrapper over the Inode variant above.
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
12888
12889 /* Currently we cannot take advantage of redundancy in reads, since we
12890 would have to go through all possible placement groups (a
12891 potentially quite large number determined by a hash), and use CRUSH
12892 to calculate the appropriate set of OSDs for each placement group,
12893 then index into that. An array with one entry per OSD is much more
12894 tractable and works for demonstration purposes. */
12895
12896 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12897 file_layout_t* layout)
12898 {
12899 std::lock_guard lock(client_lock);
12900
12901 inodeno_t ino = in->ino;
12902 uint32_t object_size = layout->object_size;
12903 uint32_t su = layout->stripe_unit;
12904 uint32_t stripe_count = layout->stripe_count;
12905 uint64_t stripes_per_object = object_size / su;
12906 uint64_t stripeno = 0, stripepos = 0;
12907
12908 if(stripe_count) {
12909 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12910 stripepos = blockno % stripe_count; // which object in the object set (X)
12911 }
12912 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12913 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12914
12915 object_t oid = file_object_t(ino, objectno);
12916 return objecter->with_osdmap([&](const OSDMap& o) {
12917 ceph_object_layout olayout =
12918 o.file_to_object_layout(oid, *layout);
12919 pg_t pg = (pg_t)olayout.ol_pgid;
12920 vector<int> osds;
12921 int primary;
12922 o.pg_to_acting_osds(pg, &osds, &primary);
12923 return primary;
12924 });
12925 }
12926
12927 /* Return the offset of the block, internal to the object */
12928
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  // Offset of block `blockno` within its RADOS object, per the inode's
  // striping parameters.
  std::lock_guard lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  // NOTE(review): if su == 0 or object_size < su this divides/modulos by
  // zero; assumes the layout has been validated upstream — confirm.
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
12939
// Open a directory for reading; on success *dirpp holds the new
// dir_result_t. Returns 0 or a negative errno.
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // Enforce open permission client-side unless FUSE does it for us.
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  // NOTE(review): *dirpp is logged unconditionally below — assumes
  // _opendir always stores into *dirpp even on error; confirm.
  tout(cct) << (unsigned long)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
		<< dendl;
  return r;
}
12969
// Close a directory handle previously returned by ll_opendir().
int Client::ll_releasedir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  if (unmounting)
    return -ENOTCONN;

  _closedir(dirp);
  return 0;
}
12983
// fsync the directory backing an open dir handle (data + metadata;
// syncdataonly=false).
int Client::ll_fsyncdir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fsync(dirp->inode.get(), false);
}
12996
// Open an existing inode. O_CREAT is not allowed here (use ll_create).
// On success, *fhp (if non-NULL) receives the new file handle, which is
// also tracked in ll_unclosed_fh_set.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  // Permission check is skipped when FUSE enforces permissions itself.
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // fhp may be NULL (caller doesn't want a handle) or untouched on error;
  // only register/log a handle that was actually produced.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
      " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
13034
// Shared implementation for ll_create/ll_createx: lookup-or-create `name`
// under `parent`, honoring O_CREAT/O_EXCL, and open the result.
// On success *in holds the inode and *fhp the open handle (registered in
// ll_unclosed_fh_set). Returns 0 or a negative errno.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing name must fail.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    auto fuse_default_permissions = cct->_conf.get_val<bool>(
      "fuse_default_permissions");
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create opens the file as part of creation and may set *fhp.
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Existing file: perform the open permission check and open it
    // ourselves (creation would have opened it already).
    auto fuse_default_permissions = cct->_conf.get_val<bool>(
      "fuse_default_permissions");
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

 out:
  // Track any handle we produced, even on partial failure paths above
  // that left *fhp set.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13120
13121 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13122 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13123 const UserPerm& perms)
13124 {
13125 std::lock_guard lock(client_lock);
13126 InodeRef in;
13127
13128 if (unmounting)
13129 return -ENOTCONN;
13130
13131 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13132 fhp, perms);
13133 if (r >= 0) {
13134 ceph_assert(in);
13135
13136 // passing an Inode in outp requires an additional ref
13137 if (outp) {
13138 _ll_get(in.get());
13139 *outp = in.get();
13140 }
13141 fill_stat(in, attr);
13142 } else {
13143 attr->st_ino = 0;
13144 }
13145
13146 return r;
13147 }
13148
13149 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13150 int oflags, Inode **outp, Fh **fhp,
13151 struct ceph_statx *stx, unsigned want, unsigned lflags,
13152 const UserPerm& perms)
13153 {
13154 unsigned caps = statx_to_mask(lflags, want);
13155 std::lock_guard lock(client_lock);
13156 InodeRef in;
13157
13158 if (unmounting)
13159 return -ENOTCONN;
13160
13161 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13162 if (r >= 0) {
13163 ceph_assert(in);
13164
13165 // passing an Inode in outp requires an additional ref
13166 if (outp) {
13167 _ll_get(in.get());
13168 *outp = in.get();
13169 }
13170 fill_statx(in, caps, stx);
13171 } else {
13172 stx->stx_ino = 0;
13173 stx->stx_mask = 0;
13174 }
13175
13176 return r;
13177 }
13178
// Seek within an open file handle; semantics follow lseek(2) whence.
loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "ll_lseek" << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _lseek(fh, offset, whence);
}
13191
13192 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13193 {
13194 std::lock_guard lock(client_lock);
13195 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
13196 tout(cct) << "ll_read" << std::endl;
13197 tout(cct) << (unsigned long)fh << std::endl;
13198 tout(cct) << off << std::endl;
13199 tout(cct) << len << std::endl;
13200
13201 if (unmounting)
13202 return -ENOTCONN;
13203
13204 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13205 len = std::min(len, (loff_t)INT_MAX);
13206 return _read(fh, off, len, bl);
13207 }
13208
// Read one RADOS object (`blockid`) of the file directly via the
// objecter, bypassing the page cache. Returns bytes read or a negative
// errno.
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop the client lock while blocking on the OSD reply so other client
  // operations can make progress; retake it before returning.
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    // Success: copy the data out and report the actual number of bytes.
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
13245
13246 /* It appears that the OSD doesn't return success unless the entire
13247 buffer was written, return the write length on success. */
13248
// Write one RADOS object (`blockid`) of the file directly via the
// objecter. Returns `length` on success (the OSD only reports success for
// a full write) or a negative errno.
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  vinodeno_t vino = ll_get_vino(in);
  int r = 0;
  std::unique_ptr<C_SaferCond> onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  // NOTE(review): `true ||` makes this unconditional — every write waits
  // for commit regardless of `sync`; presumably deliberate, confirm
  // before "fixing".
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  ceph::bufferlist bl;
  if (length > 0) {
    bl.push_back(buffer::copy(buf, length));
  }

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  // Minimal snap context carrying only the caller-supplied sequence.
  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();
  if (unmounting) {
    client_lock.Unlock();
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe.get());

  // Wait for commit without holding the client lock.
  client_lock.Unlock();
  if (nullptr != onsafe) {
    r = onsafe->wait();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
13306
// Stub: the barrier/commit machinery below is disabled (kept for
// reference); currently always reports success.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
13332
13333 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13334 {
13335 std::lock_guard lock(client_lock);
13336 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13337 "~" << len << dendl;
13338 tout(cct) << "ll_write" << std::endl;
13339 tout(cct) << (unsigned long)fh << std::endl;
13340 tout(cct) << off << std::endl;
13341 tout(cct) << len << std::endl;
13342
13343 if (unmounting)
13344 return -ENOTCONN;
13345
13346 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13347 len = std::min(len, (loff_t)INT_MAX);
13348 int r = _write(fh, off, len, data, NULL, 0);
13349 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13350 << dendl;
13351 return r;
13352 }
13353
// Vectored write at `off` (write=true, no sync flag).
int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  if (unmounting)
    return -ENOTCONN;
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
}
13361
// Vectored read at `off` (write=false, no sync flag).
int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  if (unmounting)
    return -ENOTCONN;
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
}
13369
// Flush buffered data for an open handle (close(2)-style flush).
int Client::ll_flush(Fh *fh)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flush(fh);
}
13382
13383 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13384 {
13385 std::lock_guard lock(client_lock);
13386 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13387 tout(cct) << "ll_fsync" << std::endl;
13388 tout(cct) << (unsigned long)fh << std::endl;
13389
13390 if (unmounting)
13391 return -ENOTCONN;
13392
13393 int r = _fsync(fh, syncdataonly);
13394 if (r) {
13395 // If we're returning an error, clear it from the FH
13396 fh->take_async_err();
13397 }
13398 return r;
13399 }
13400
// fsync an inode directly (no file handle); with syncdataonly, metadata
// is not flushed.
int Client::ll_sync_inode(Inode *in, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
  tout(cct) << "ll_sync_inode" << std::endl;
  tout(cct) << (unsigned long)in << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fsync(in, syncdataonly);
}
13413
13414 #ifdef FALLOC_FL_PUNCH_HOLE
13415
// fallocate(2) core: supports plain preallocation (extend i_size) and
// FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE (zero a range). Called with
// client_lock held; temporarily drops it while waiting on OSD ops.
// Returns 0 or a negative errno.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  // Only KEEP_SIZE and PUNCH_HOLE are understood.
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // As on Linux, PUNCH_HOLE requires KEEP_SIZE.
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocation on a full pool fails early; punching a hole frees space
  // and is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Quota check only applies when the file would actually grow.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  // Need the WR cap; BUFFER lets us mutate inline data locally.
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and we hold BUFFER: punch the hole in memory by
      // rebuilding the inline blob with a zeroed middle section.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Data lives (or must move) in RADOS: uninline first if needed,
      // then zero the range on the OSDs.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      // Track the in-flight sync write and pin the BUFFER cap until the
      // zero completes.
      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Wait for the OSD zero without holding the client lock.
      client_lock.Unlock();
      onfinish.wait();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation: just extend the file size if needed.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
	check_caps(in, 0);
      }
    }
  }

  // If we kicked off an uninline, wait for it (lock dropped) and finalize
  // the inline state. -ECANCELED means someone else already uninlined.
  if (nullptr != onuninline) {
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();

    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  // Release the WR cap taken by get_caps() above.
  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13533 #else
13534
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  // Built without FALLOC_FL_PUNCH_HOLE (non-Linux): fallocate is not
  // supported at all.
  return -EOPNOTSUPP;
}
13539
13540 #endif
13541
13542
// Low-level fallocate entry point; traces the call and delegates.
int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fallocate(fh, mode, offset, length);
}
13555
13556 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13557 {
13558 std::lock_guard lock(client_lock);
13559 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
13560
13561 if (unmounting)
13562 return -ENOTCONN;
13563
13564 Fh *fh = get_filehandle(fd);
13565 if (!fh)
13566 return -EBADF;
13567 #if defined(__linux__) && defined(O_PATH)
13568 if (fh->flags & O_PATH)
13569 return -EBADF;
13570 #endif
13571 return _fallocate(fh, mode, offset, length);
13572 }
13573
13574 int Client::ll_release(Fh *fh)
13575 {
13576 std::lock_guard lock(client_lock);
13577
13578 if (unmounting)
13579 return -ENOTCONN;
13580
13581 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
13582 dendl;
13583 tout(cct) << __func__ << " (fh)" << std::endl;
13584 tout(cct) << (unsigned long)fh << std::endl;
13585
13586 if (ll_unclosed_fh_set.count(fh))
13587 ll_unclosed_fh_set.erase(fh);
13588 return _release_fh(fh);
13589 }
13590
13591 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13592 {
13593 std::lock_guard lock(client_lock);
13594
13595 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13596 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13597
13598 if (unmounting)
13599 return -ENOTCONN;
13600
13601 return _getlk(fh, fl, owner);
13602 }
13603
// POSIX advisory lock set (F_SETLK/F_SETLKW when sleep != 0) on an open
// handle.
int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _setlk(fh, fl, owner, sleep);
}
13616
// BSD flock(2)-style whole-file lock on an open handle.
int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flock(fh, cmd, owner);
}
13629
// Configure the delegation recall timeout (seconds). Rejected with
// -EINVAL if it would not expire before the MDS session autoclose.
int Client::set_deleg_timeout(uint32_t timeout)
{
  std::lock_guard lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
13644
13645 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13646 {
13647 int ret = -EINVAL;
13648
13649 std::lock_guard lock(client_lock);
13650
13651 if (!mounted)
13652 return -ENOTCONN;
13653
13654 Inode *inode = fh->inode.get();
13655
13656 switch(cmd) {
13657 case CEPH_DELEGATION_NONE:
13658 inode->unset_deleg(fh);
13659 ret = 0;
13660 break;
13661 default:
13662 try {
13663 ret = inode->set_deleg(fh, cmd, cb, priv);
13664 } catch (std::bad_alloc&) {
13665 ret = -ENOMEM;
13666 }
13667 break;
13668 }
13669 return ret;
13670 }
13671
// Completion that interrupts an in-flight SETFILELOCK request from the
// interrupt finisher thread (see Client::ll_interrupt).
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  // Takes a ref on req so it stays valid until finish() runs.
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    // Only file-lock requests are interruptible this way.
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the constructor
  }
};
13687
13688 void Client::ll_interrupt(void *d)
13689 {
13690 MetaRequest *req = static_cast<MetaRequest*>(d);
13691 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
13692 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
13693 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13694 }
13695
13696 // =========================================
13697 // layout
13698
13699 // expose file layouts
13700
13701 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13702 const UserPerm& perms)
13703 {
13704 std::lock_guard lock(client_lock);
13705
13706 if (unmounting)
13707 return -ENOTCONN;
13708
13709 filepath path(relpath);
13710 InodeRef in;
13711 int r = path_walk(path, &in, perms);
13712 if (r < 0)
13713 return r;
13714
13715 *lp = in->layout;
13716
13717 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
13718 return 0;
13719 }
13720
13721 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13722 {
13723 std::lock_guard lock(client_lock);
13724
13725 if (unmounting)
13726 return -ENOTCONN;
13727
13728 Fh *f = get_filehandle(fd);
13729 if (!f)
13730 return -EBADF;
13731 Inode *in = f->inode.get();
13732
13733 *lp = in->layout;
13734
13735 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
13736 return 0;
13737 }
13738
// Return the default data pool id for this filesystem.
int64_t Client::get_default_pool_id()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}
13749
13750 // expose osdmap
13751
13752 int64_t Client::get_pool_id(const char *pool_name)
13753 {
13754 std::lock_guard lock(client_lock);
13755
13756 if (unmounting)
13757 return -ENOTCONN;
13758
13759 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13760 pool_name);
13761 }
13762
13763 string Client::get_pool_name(int64_t pool)
13764 {
13765 std::lock_guard lock(client_lock);
13766
13767 if (unmounting)
13768 return string();
13769
13770 return objecter->with_osdmap([pool](const OSDMap& o) {
13771 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13772 });
13773 }
13774
13775 int Client::get_pool_replication(int64_t pool)
13776 {
13777 std::lock_guard lock(client_lock);
13778
13779 if (unmounting)
13780 return -ENOTCONN;
13781
13782 return objecter->with_osdmap([pool](const OSDMap& o) {
13783 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13784 });
13785 }
13786
// For the byte at `off` in the open file `fd`, report the acting OSDs of
// the object holding it, and (optionally, via *len) how many bytes remain
// in that stripe unit.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // A one-byte mapping always yields exactly one extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13832
13833 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13834 {
13835 std::lock_guard lock(client_lock);
13836
13837 if (unmounting)
13838 return -ENOTCONN;
13839
13840 if (id < 0)
13841 return -EINVAL;
13842 return objecter->with_osdmap([&](const OSDMap& o) {
13843 return o.crush->get_full_location_ordered(id, path);
13844 });
13845 }
13846
13847 int Client::get_file_stripe_address(int fd, loff_t offset,
13848 vector<entity_addr_t>& address)
13849 {
13850 std::lock_guard lock(client_lock);
13851
13852 if (unmounting)
13853 return -ENOTCONN;
13854
13855 Fh *f = get_filehandle(fd);
13856 if (!f)
13857 return -EBADF;
13858 Inode *in = f->inode.get();
13859
13860 // which object?
13861 vector<ObjectExtent> extents;
13862 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13863 in->truncate_size, extents);
13864 ceph_assert(extents.size() == 1);
13865
13866 // now we have the object and its 'layout'
13867 return objecter->with_osdmap([&](const OSDMap& o) {
13868 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13869 vector<int> osds;
13870 o.pg_to_acting_osds(pg, osds);
13871 if (osds.empty())
13872 return -EINVAL;
13873 for (unsigned i = 0; i < osds.size(); i++) {
13874 entity_addr_t addr = o.get_addrs(osds[i]).front();
13875 address.push_back(addr);
13876 }
13877 return 0;
13878 });
13879 }
13880
13881 int Client::get_osd_addr(int osd, entity_addr_t& addr)
13882 {
13883 std::lock_guard lock(client_lock);
13884
13885 if (unmounting)
13886 return -ENOTCONN;
13887
13888 return objecter->with_osdmap([&](const OSDMap& o) {
13889 if (!o.exists(osd))
13890 return -ENOENT;
13891
13892 addr = o.get_addrs(osd).front();
13893 return 0;
13894 });
13895 }
13896
13897 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13898 loff_t length, loff_t offset)
13899 {
13900 std::lock_guard lock(client_lock);
13901
13902 if (unmounting)
13903 return -ENOTCONN;
13904
13905 Fh *f = get_filehandle(fd);
13906 if (!f)
13907 return -EBADF;
13908 Inode *in = f->inode.get();
13909
13910 // map to a list of extents
13911 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13912
13913 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13914 return 0;
13915 }
13916
13917
13918 /* find an osd with the same ip. -ENXIO if none. */
13919 int Client::get_local_osd()
13920 {
13921 std::lock_guard lock(client_lock);
13922
13923 if (unmounting)
13924 return -ENOTCONN;
13925
13926 objecter->with_osdmap([this](const OSDMap& o) {
13927 if (o.get_epoch() != local_osd_epoch) {
13928 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
13929 local_osd_epoch = o.get_epoch();
13930 }
13931 });
13932 return local_osd;
13933 }
13934
13935
13936
13937
13938
13939
13940 // ===============================
13941
13942 void Client::ms_handle_connect(Connection *con)
13943 {
13944 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
13945 }
13946
13947 bool Client::ms_handle_reset(Connection *con)
13948 {
13949 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
13950 return false;
13951 }
13952
// Handle a remote reset of a connection: for MDS peers, transition the
// matching session according to its current state (close, retry open,
// or mark stale). Other peer types are ignored.
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  std::lock_guard l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      // Match the peer's addresses against every known session's rank.
      for (auto &p : mds_sessions) {
	if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = &p.second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // We wanted it closed anyway; treat the reset as completion.
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    // Preserve the contexts waiting on the open: move them aside,
	    // close the dead session, then hand them to the fresh one.
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    const auto& conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      // Configured to reconnect: drop the session so it is reopened.
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      // Otherwise keep the session around but flag it stale.
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  // Nothing to do for sessions that are not (or no longer) live.
	  break;
	}
      }
    }
    break;
  }
}
14012
14013 bool Client::ms_handle_refused(Connection *con)
14014 {
14015 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
14016 return false;
14017 }
14018
14019 bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
14020 {
14021 if (dest_type == CEPH_ENTITY_TYPE_MON)
14022 return true;
14023 *authorizer = monclient->build_authorizer(dest_type);
14024 return true;
14025 }
14026
14027 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14028 {
14029 Inode *quota_in = root_ancestor;
14030 SnapRealm *realm = in->snaprealm;
14031 while (realm) {
14032 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14033 if (realm->ino != in->ino) {
14034 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14035 if (p == inode_map.end())
14036 break;
14037
14038 if (p->second->quota.is_enable()) {
14039 quota_in = p->second;
14040 break;
14041 }
14042 }
14043 realm = realm->pparent;
14044 }
14045 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14046 return quota_in;
14047 }
14048
14049 /**
14050 * Traverse quota ancestors of the Inode, return true
14051 * if any of them passes the passed function
14052 */
14053 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14054 std::function<bool (const Inode &in)> test)
14055 {
14056 while (true) {
14057 ceph_assert(in != NULL);
14058 if (test(*in)) {
14059 return true;
14060 }
14061
14062 if (in == root_ancestor) {
14063 // We're done traversing, drop out
14064 return false;
14065 } else {
14066 // Continue up the tree
14067 in = get_quota_root(in, perms);
14068 }
14069 }
14070
14071 return false;
14072 }
14073
14074 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14075 {
14076 return check_quota_condition(in, perms,
14077 [](const Inode &in) {
14078 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14079 });
14080 }
14081
14082 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14083 const UserPerm& perms)
14084 {
14085 return check_quota_condition(in, perms,
14086 [&new_bytes](const Inode &in) {
14087 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14088 > in.quota.max_bytes;
14089 });
14090 }
14091
14092 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
14093 {
14094 return check_quota_condition(in, perms,
14095 [](const Inode &in) {
14096 if (in.quota.max_bytes) {
14097 if (in.rstat.rbytes >= in.quota.max_bytes) {
14098 return true;
14099 }
14100
14101 ceph_assert(in.size >= in.reported_size);
14102 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14103 const uint64_t size = in.size - in.reported_size;
14104 return (space >> 4) < size;
14105 } else {
14106 return false;
14107 }
14108 });
14109 }
14110
// Per-(pool, namespace) permission state cached in pool_perms; used by
// check_pool_perm() below. Values are OR-ed together.
enum {
  POOL_CHECKED = 1,   // a permission probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is in flight; other callers must wait
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
14117
// Verify this client has the OSD capabilities ('need' is a CEPH_CAP_FILE_*
// mask) for the inode's data pool/namespace. Results are cached in
// pool_perms; the actual probe issues one read-like and one write-like op
// against the file's first object and classifies the returned errors.
// Returns 0, -EPERM on missing permission, or -EIO if the probe itself
// failed for an unrelated reason.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is mid-probe, block until it
  // finishes and re-check.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the probe in flight so concurrent callers wait above.
    pool_perms[perm_key] = POOL_CHECKING;

    // First object of the file: "<ino in hex>.00000000".
    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: a stat issued via mutate() so it exercises write-path
    // permission checks distinct from the create below.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: exclusive create (EEXIST is fine and still proves
    // write permission).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while waiting on the OSD round trips.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    // ENOENT means the object doesn't exist yet, which still proves the
    // read capability; anything other than EPERM is an unrelated failure.
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // Cache the result and wake any waiters blocked on the probe.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14220
14221 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14222 {
14223 if (acl_type == POSIX_ACL) {
14224 if (in->xattrs.count(ACL_EA_ACCESS)) {
14225 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14226
14227 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14228 }
14229 }
14230 return -EAGAIN;
14231 }
14232
14233 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
14234 {
14235 if (acl_type == NO_ACL)
14236 return 0;
14237
14238 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
14239 if (r < 0)
14240 goto out;
14241
14242 if (acl_type == POSIX_ACL) {
14243 if (in->xattrs.count(ACL_EA_ACCESS)) {
14244 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14245 bufferptr acl(access_acl.c_str(), access_acl.length());
14246 r = posix_acl_access_chmod(acl, mode);
14247 if (r < 0)
14248 goto out;
14249 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
14250 } else {
14251 r = 0;
14252 }
14253 }
14254 out:
14255 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
14256 return r;
14257 }
14258
14259 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
14260 const UserPerm& perms)
14261 {
14262 if (acl_type == NO_ACL)
14263 return 0;
14264
14265 if (S_ISLNK(*mode))
14266 return 0;
14267
14268 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
14269 if (r < 0)
14270 goto out;
14271
14272 if (acl_type == POSIX_ACL) {
14273 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
14274 map<string, bufferptr> xattrs;
14275
14276 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
14277 bufferptr acl(default_acl.c_str(), default_acl.length());
14278 r = posix_acl_inherit_mode(acl, mode);
14279 if (r < 0)
14280 goto out;
14281
14282 if (r > 0) {
14283 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
14284 if (r < 0)
14285 goto out;
14286 if (r > 0)
14287 xattrs[ACL_EA_ACCESS] = acl;
14288 }
14289
14290 if (S_ISDIR(*mode))
14291 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
14292
14293 r = xattrs.size();
14294 if (r > 0)
14295 encode(xattrs, xattrs_bl);
14296 } else {
14297 if (umask_cb)
14298 *mode &= ~umask_cb(callback_handle);
14299 r = 0;
14300 }
14301 }
14302 out:
14303 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
14304 return r;
14305 }
14306
14307 void Client::set_filer_flags(int flags)
14308 {
14309 std::lock_guard l(client_lock);
14310 ceph_assert(flags == 0 ||
14311 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14312 objecter->add_global_op_flags(flags);
14313 }
14314
14315 void Client::clear_filer_flags(int flags)
14316 {
14317 std::lock_guard l(client_lock);
14318 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14319 objecter->clear_global_op_flag(flags);
14320 }
14321
14322 // called before mount
14323 void Client::set_uuid(const std::string& uuid)
14324 {
14325 std::lock_guard l(client_lock);
14326 assert(initialized);
14327 assert(!uuid.empty());
14328
14329 metadata["uuid"] = uuid;
14330 _close_sessions();
14331 }
14332
14333 // called before mount. 0 means infinite
14334 void Client::set_session_timeout(unsigned timeout)
14335 {
14336 std::lock_guard l(client_lock);
14337 assert(initialized);
14338
14339 metadata["timeout"] = stringify(timeout);
14340 }
14341
// called before mount
// Reclaim the MDS sessions of a dead client instance identified by 'uuid'
// (e.g. for NFS-Ganesha takeover). Walks every in-map MDS rank, opens a
// session if needed, and sends MClientReclaim; replies are processed by
// handle_client_reclaim_reply(). With CEPH_RECLAIM_RESET the target
// sessions are just reset; otherwise we additionally verify via the OSD
// blacklist that the old instance is really gone. Returns 0 or a negative
// error (-ENOENT, -EPERM, -EOPNOTSUPP, -ENOTRECOVERABLE, ...).
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    // Reclaiming our own uuid makes no sense.
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // Wait until we have a real MDS map before iterating ranks.
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // Note: 'mds' only advances once that rank's reclaim has succeeded;
  // waits re-test the same rank after being signalled.
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      if (rejected_by_mds.count(mds))
	return -EPERM;
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      // Send (or re-send) the reclaim request and wait for the reply.
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = MClientReclaim::create(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      // RECLAIM_OK: move on to the next rank.
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    // Drop the client lock while waiting for the OSD map to catch up.
    client_lock.Unlock();
    cond.wait();
    client_lock.Lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // Remember the uuid being reclaimed; finish_reclaim() adopts it.
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14440
14441 void Client::finish_reclaim()
14442 {
14443 auto it = metadata.find("reclaiming_uuid");
14444 if (it == metadata.end()) {
14445 for (auto &p : mds_sessions)
14446 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14447 return;
14448 }
14449
14450 for (auto &p : mds_sessions) {
14451 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14452 auto m = MClientReclaim::create("", MClientReclaim::FLAG_FINISH);
14453 p.second.con->send_message2(std::move(m));
14454 }
14455
14456 metadata["uuid"] = it->second;
14457 metadata.erase(it);
14458 }
14459
14460 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
14461 {
14462 mds_rank_t from = mds_rank_t(reply->get_source().num());
14463 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
14464
14465 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
14466 if (!session) {
14467 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
14468 return;
14469 }
14470
14471 if (reply->get_result() >= 0) {
14472 session->reclaim_state = MetaSession::RECLAIM_OK;
14473 if (reply->get_epoch() > reclaim_osd_epoch)
14474 reclaim_osd_epoch = reply->get_epoch();
14475 if (!reply->get_addrs().empty())
14476 reclaim_target_addrs = reply->get_addrs();
14477 } else {
14478 session->reclaim_state = MetaSession::RECLAIM_FAIL;
14479 reclaim_errno = reply->get_result();
14480 }
14481
14482 signal_cond_list(waiting_for_reclaim);
14483 }
14484
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 *
 * @param e the OSD map epoch to record as the barrier
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
14497
14498 const char** Client::get_tracked_conf_keys() const
14499 {
14500 static const char* keys[] = {
14501 "client_cache_size",
14502 "client_cache_mid",
14503 "client_acl_type",
14504 "client_deleg_timeout",
14505 "client_deleg_break_on_open",
14506 NULL
14507 };
14508 return keys;
14509 }
14510
14511 void Client::handle_conf_change(const ConfigProxy& conf,
14512 const std::set <std::string> &changed)
14513 {
14514 std::lock_guard lock(client_lock);
14515
14516 if (changed.count("client_cache_mid")) {
14517 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14518 }
14519 if (changed.count("client_acl_type")) {
14520 acl_type = NO_ACL;
14521 if (cct->_conf->client_acl_type == "posix_acl")
14522 acl_type = POSIX_ACL;
14523 }
14524 }
14525
// boost::intrusive_ptr hook: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14530
// boost::intrusive_ptr hook: drop a reference via the owning Client so the
// inode can be cleaned up/trimmed when the count falls.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14535
14536 mds_rank_t Client::_get_random_up_mds() const
14537 {
14538 ceph_assert(client_lock.is_locked_by_me());
14539
14540 std::set<mds_rank_t> up;
14541 mdsmap->get_up_mds_set(up);
14542
14543 if (up.empty())
14544 return MDS_RANK_NONE;
14545 std::set<mds_rank_t>::const_iterator p = up.begin();
14546 for (int n = rand() % up.size(); n; n--)
14547 ++p;
14548 return *p;
14549 }
14550
14551
// Standalone variant owns its Objecter (created here, deleted in the
// destructor), unlike embedded users that pass one in.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14558
StandaloneClient::~StandaloneClient()
{
  // The Objecter was allocated in our constructor, so free it here.
  delete objecter;
  objecter = nullptr;
}
14564
// Bring up the standalone client: timer, object cacher, objecter, message
// dispatch and the monitor client, then run the generic Client init.
// Returns 0 or a negative error from MonClient::init().
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    // (note: lock is released before shutting down the objecter/cacher).
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  // Generic Client initialization (run without the client lock held).
  _finish_init();

  return 0;
}
14595
void StandaloneClient::shutdown()
{
  // Tear down in reverse order of init(): the client first, then the
  // objecter we own, then the monitor client.
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}