]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
update sources to v12.2.5
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <sys/stat.h>
22 #include <sys/param.h>
23 #include <fcntl.h>
24 #include <sys/file.h>
25 #include <sys/utsname.h>
26 #include <sys/uio.h>
27
28 #include <boost/lexical_cast.hpp>
29 #include <boost/fusion/include/std_pair.hpp>
30
31 #if defined(__FreeBSD__)
32 #define XATTR_CREATE 0x1
33 #define XATTR_REPLACE 0x2
34 #else
35 #include <sys/xattr.h>
36 #endif
37
38 #if defined(__linux__)
39 #include <linux/falloc.h>
40 #endif
41
42 #include <sys/statvfs.h>
43
44 #include "common/config.h"
45 #include "common/version.h"
46
47 // ceph stuff
48 #include "messages/MClientSession.h"
49 #include "messages/MClientReconnect.h"
50 #include "messages/MClientRequest.h"
51 #include "messages/MClientRequestForward.h"
52 #include "messages/MClientReply.h"
53 #include "messages/MClientCaps.h"
54 #include "messages/MClientLease.h"
55 #include "messages/MClientSnap.h"
56 #include "messages/MCommandReply.h"
57 #include "messages/MOSDMap.h"
58 #include "messages/MClientQuota.h"
59 #include "messages/MClientCapRelease.h"
60 #include "messages/MMDSMap.h"
61 #include "messages/MFSMap.h"
62 #include "messages/MFSMapUser.h"
63
64 #include "mon/MonClient.h"
65
66 #include "mds/flock.h"
67 #include "osd/OSDMap.h"
68 #include "osdc/Filer.h"
69
70 #include "common/Cond.h"
71 #include "common/Mutex.h"
72 #include "common/perf_counters.h"
73 #include "common/admin_socket.h"
74 #include "common/errno.h"
75 #include "include/str_list.h"
76
77 #define dout_subsys ceph_subsys_client
78
79 #include "include/lru.h"
80 #include "include/compat.h"
81 #include "include/stringify.h"
82
83 #include "Client.h"
84 #include "Inode.h"
85 #include "Dentry.h"
86 #include "Delegation.h"
87 #include "Dir.h"
88 #include "ClientSnapRealm.h"
89 #include "Fh.h"
90 #include "MetaSession.h"
91 #include "MetaRequest.h"
92 #include "ObjecterWriteback.h"
93 #include "posix_acl.h"
94
95 #include "include/assert.h"
96 #include "include/stat.h"
97
98 #include "include/cephfs/ceph_statx.h"
99
100 #if HAVE_GETGROUPLIST
101 #include <grp.h>
102 #include <pwd.h>
103 #include <unistd.h>
104 #endif
105
106 #undef dout_prefix
107 #define dout_prefix *_dout << "client." << whoami << " "
108
109 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
110
111 // FreeBSD fails to define this
112 #ifndef O_DSYNC
113 #define O_DSYNC 0x0
114 #endif
115 // Darwin fails to define this
116 #ifndef O_RSYNC
117 #define O_RSYNC 0x0
118 #endif
119
120 #ifndef O_DIRECT
121 #define O_DIRECT 0x0
122 #endif
123
124 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
125
126 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
127 {
128 Client *client = static_cast<Client*>(p);
129 client->flush_set_callback(oset);
130 }
131
132
133 // -------------
134
// Admin-socket command hook.  Stores a back-pointer to the owning Client
// so commands registered in _finish_init() can be dispatched to it in call().
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}
139
140 bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
141 std::string format, bufferlist& out)
142 {
143 Formatter *f = Formatter::create(format);
144 f->open_object_section("result");
145 m_client->client_lock.Lock();
146 if (command == "mds_requests")
147 m_client->dump_mds_requests(f);
148 else if (command == "mds_sessions")
149 m_client->dump_mds_sessions(f);
150 else if (command == "dump_cache")
151 m_client->dump_cache(f);
152 else if (command == "kick_stale_sessions")
153 m_client->_kick_stale_sessions();
154 else if (command == "status")
155 m_client->dump_status(f);
156 else
157 assert(0 == "bad command registered");
158 m_client->client_lock.Unlock();
159 f->close_section();
160 f->flush(out);
161 delete f;
162 return true;
163 }
164
165
166 // -------------
167
// Per-open-directory read state.  next_offset starts at 2 (offset values
// below 2 are reserved; see the readdir_offset == 2 assertion in
// insert_readdir_results), and the opener's credentials are captured for
// later readdir operations.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
173
174 void Client::_reset_faked_inos()
175 {
176 ino_t start = 1024;
177 free_faked_inos.clear();
178 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
179 last_used_faked_ino = 0;
180 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
181 }
182
// Allocate the next free faked ino for 'in', scanning forward from the
// last ino handed out and wrapping back to the start of the free set when
// the high end is exhausted.  Records the reverse mapping so
// _map_faked_ino() can translate it back to the real vinodeno.
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // nothing free above the cursor: wrap around and rescan from the start
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // the free interval begins above the cursor; take its first ino
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // cursor lies inside the free interval; take the next ino in sequence
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
202
203 void Client::_release_faked_ino(Inode *in)
204 {
205 free_faked_inos.insert(in->faked_ino);
206 faked_ino_map.erase(in->faked_ino);
207 }
208
209 vinodeno_t Client::_map_faked_ino(ino_t ino)
210 {
211 vinodeno_t vino;
212 if (ino == 1)
213 vino = root->vino();
214 else if (faked_ino_map.count(ino))
215 vino = faked_ino_map[ino];
216 else
217 vino = vinodeno_t(0, CEPH_NOSNAP);
218 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
219 return vino;
220 }
221
// Public, locking wrapper around _map_faked_ino().
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  Mutex::Locker lock(client_lock);
  return _map_faked_ino(ino);
}
227
228 // cons/des
229
// Construct a Client bound to the given messenger, monitor client and
// objecter.  Wires up the finishers, timer and the osdc layer (writeback
// handler, object cacher, filer); only the objecter_finisher is started
// here — everything else waits for init()/mount().  Note that client_lock
// is shared with the timer and the object cacher.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    getgroups_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-ENXIO), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock"),
    deleg_timeout(0)
{
  _reset_faked_inos();
  root = 0;

  num_flushing_caps = 0;

  // precompute the listxattr name-buffer sizes for the virtual xattrs
  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are never handed out by this allocator
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					        &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback,    // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
298
299
// Destructor: tears down the metadata cache.  Must be called with
// client_lock NOT held (asserted), since we take it ourselves.
Client::~Client()
{
  assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
311
312 void Client::tear_down_cache()
313 {
314 // fd's
315 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
316 it != fd_map.end();
317 ++it) {
318 Fh *fh = it->second;
319 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
320 _release_fh(fh);
321 }
322 fd_map.clear();
323
324 while (!opened_dirs.empty()) {
325 dir_result_t *dirp = *opened_dirs.begin();
326 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
327 _closedir(dirp);
328 }
329
330 // caps!
331 // *** FIXME ***
332
333 // empty lru
334 trim_cache();
335 assert(lru.lru_get_size() == 0);
336
337 // close root ino
338 assert(inode_map.size() <= 1 + root_parents.size());
339 if (root && inode_map.size() == 1 + root_parents.size()) {
340 delete root;
341 root = 0;
342 root_ancestor = 0;
343 while (!root_parents.empty())
344 root_parents.erase(root_parents.begin());
345 inode_map.clear();
346 _reset_faked_inos();
347 }
348
349 assert(inode_map.empty());
350 }
351
352 inodeno_t Client::get_root_ino()
353 {
354 Mutex::Locker l(client_lock);
355 if (use_faked_inos())
356 return root->faked_ino;
357 else
358 return root->ino;
359 }
360
// Take an ll reference on the root inode and return it; the caller is
// responsible for releasing that reference.
Inode *Client::get_root()
{
  Mutex::Locker l(client_lock);
  root->ll_get();
  return root;
}
367
368
369 // debug crapola
370
371 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
372 {
373 filepath path;
374 in->make_long_path(path);
375 ldout(cct, 1) << "dump_inode: "
376 << (disconnected ? "DISCONNECTED ":"")
377 << "inode " << in->ino
378 << " " << path
379 << " ref " << in->get_num_ref()
380 << *in << dendl;
381
382 if (f) {
383 f->open_object_section("inode");
384 f->dump_stream("path") << path;
385 if (disconnected)
386 f->dump_int("disconnected", 1);
387 in->dump(f);
388 f->close_section();
389 }
390
391 did.insert(in);
392 if (in->dir) {
393 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
394 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
395 it != in->dir->dentries.end();
396 ++it) {
397 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
398 if (f) {
399 f->open_object_section("dentry");
400 it->second->dump(f);
401 f->close_section();
402 }
403 if (it->second->inode)
404 dump_inode(f, it->second->inode.get(), did, false);
405 }
406 }
407 }
408
409 void Client::dump_cache(Formatter *f)
410 {
411 set<Inode*> did;
412
413 ldout(cct, 1) << "dump_cache" << dendl;
414
415 if (f)
416 f->open_array_section("cache");
417
418 if (root)
419 dump_inode(f, root, did, true);
420
421 // make a second pass to catch anything disconnected
422 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
423 it != inode_map.end();
424 ++it) {
425 if (did.count(it->second))
426 continue;
427 dump_inode(f, it->second, did, true);
428 }
429
430 if (f)
431 f->close_section();
432 }
433
// Dump client metadata, cache counters and map epochs to the formatter.
// Caller must hold client_lock (asserted below).
void Client::dump_status(Formatter *f)
{
  assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
  }
}
458
// Start the timer and object cacher, hook this dispatcher into the
// messenger, then complete initialization (perf counters, admin-socket
// commands) in _finish_init().  Returns 0.
int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}
473
474 void Client::_finish_init()
475 {
476 client_lock.Lock();
477 // logger
478 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
479 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
480 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
481 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
482 logger.reset(plb.create_perf_counters());
483 cct->get_perfcounters_collection()->add(logger.get());
484
485 client_lock.Unlock();
486
487 cct->_conf->add_observer(this);
488
489 AdminSocket* admin_socket = cct->get_admin_socket();
490 int ret = admin_socket->register_command("mds_requests",
491 "mds_requests",
492 &m_command_hook,
493 "show in-progress mds requests");
494 if (ret < 0) {
495 lderr(cct) << "error registering admin socket command: "
496 << cpp_strerror(-ret) << dendl;
497 }
498 ret = admin_socket->register_command("mds_sessions",
499 "mds_sessions",
500 &m_command_hook,
501 "show mds session state");
502 if (ret < 0) {
503 lderr(cct) << "error registering admin socket command: "
504 << cpp_strerror(-ret) << dendl;
505 }
506 ret = admin_socket->register_command("dump_cache",
507 "dump_cache",
508 &m_command_hook,
509 "show in-memory metadata cache contents");
510 if (ret < 0) {
511 lderr(cct) << "error registering admin socket command: "
512 << cpp_strerror(-ret) << dendl;
513 }
514 ret = admin_socket->register_command("kick_stale_sessions",
515 "kick_stale_sessions",
516 &m_command_hook,
517 "kick sessions that were remote reset");
518 if (ret < 0) {
519 lderr(cct) << "error registering admin socket command: "
520 << cpp_strerror(-ret) << dendl;
521 }
522 ret = admin_socket->register_command("status",
523 "status",
524 &m_command_hook,
525 "show overall client status");
526 if (ret < 0) {
527 lderr(cct) << "error registering admin socket command: "
528 << cpp_strerror(-ret) << dendl;
529 }
530
531 client_lock.Lock();
532 initialized = true;
533 client_lock.Unlock();
534 }
535
536 void Client::shutdown()
537 {
538 ldout(cct, 1) << "shutdown" << dendl;
539
540 // If we were not mounted, but were being used for sending
541 // MDS commands, we may have sessions that need closing.
542 client_lock.Lock();
543 _close_sessions();
544 client_lock.Unlock();
545
546 cct->_conf->remove_observer(this);
547
548 AdminSocket* admin_socket = cct->get_admin_socket();
549 admin_socket->unregister_command("mds_requests");
550 admin_socket->unregister_command("mds_sessions");
551 admin_socket->unregister_command("dump_cache");
552 admin_socket->unregister_command("kick_stale_sessions");
553 admin_socket->unregister_command("status");
554
555 if (ino_invalidate_cb) {
556 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
557 async_ino_invalidator.wait_for_empty();
558 async_ino_invalidator.stop();
559 }
560
561 if (dentry_invalidate_cb) {
562 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
563 async_dentry_invalidator.wait_for_empty();
564 async_dentry_invalidator.stop();
565 }
566
567 if (switch_interrupt_cb) {
568 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
569 interrupt_finisher.wait_for_empty();
570 interrupt_finisher.stop();
571 }
572
573 if (remount_cb) {
574 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
575 remount_finisher.wait_for_empty();
576 remount_finisher.stop();
577 }
578
579 objectcacher->stop(); // outside of client_lock! this does a join.
580
581 client_lock.Lock();
582 assert(initialized);
583 initialized = false;
584 timer.shutdown();
585 client_lock.Unlock();
586
587 objecter_finisher.wait_for_empty();
588 objecter_finisher.stop();
589
590 if (logger) {
591 cct->get_perfcounters_collection()->remove(logger.get());
592 logger.reset();
593 }
594 }
595
596
597 // ===================
598 // metadata cache stuff
599
// Trim dentries from the LRU until the cache is within client_cache_size
// (while unmounting, trim everything that can be expired).  Optionally
// asks the kernel to drop its dcache entries when we are still over
// budget, and drops the root inode chain once the cache is fully empty.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // loop until a full pass makes no progress (size unchanged)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  // still over budget: the remaining dentries are pinned; ask the kernel
  // to release what it can
  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
633
634 void Client::trim_cache_for_reconnect(MetaSession *s)
635 {
636 mds_rank_t mds = s->mds_num;
637 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
638
639 int trimmed = 0;
640 list<Dentry*> skipped;
641 while (lru.lru_get_size() > 0) {
642 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
643 if (!dn)
644 break;
645
646 if ((dn->inode && dn->inode->caps.count(mds)) ||
647 dn->dir->parent_inode->caps.count(mds)) {
648 trim_dentry(dn);
649 trimmed++;
650 } else
651 skipped.push_back(dn);
652 }
653
654 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
655 lru.lru_insert_mid(*p);
656
657 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
658 << " trimmed " << trimmed << " dentries" << dendl;
659
660 if (s->caps.size() > 0)
661 _invalidate_kernel_dcache();
662 }
663
// Unlink one dentry from the cache.  If it points at an inode, the parent
// directory's cached listing is no longer complete, so bump its release
// count and clear I_COMPLETE/I_DIR_ORDERED.
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir " << hex << dn->dir->parent_inode->ino
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
676
677
// Merge file metadata (size, truncation, change_attr, times, inline data)
// from an MDS message into the inode.  'issued' is the set of caps we hold
// (including dirty and implemented caps): when we hold EXCL/WR/BUFFER caps
// our local values may be newer than the MDS's, so mtime/atime/ctime are
// only taken according to time_warp_seq ordering.
void Client::update_inode_file_bits(Inode *in,
				    uint64_t truncate_seq, uint64_t truncate_size,
				    uint64_t size, uint64_t change_attr,
				    uint64_t time_warp_seq, utime_t ctime,
				    utime_t mtime,
				    utime_t atime,
				    version_t inline_version,
				    bufferlist& inline_data,
				    int issued)
{
  bool warn = false;
  ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
		 << " mtime " << mtime << dendl;
  ldout(cct, 25) << "truncate_seq: mds " << truncate_seq <<  " local "
		 << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
		 << " local " << in->time_warp_seq << dendl;
  uint64_t prior_size = in->size;

  if (inline_version > in->inline_version) {
    in->inline_data = inline_data;
    in->inline_version = inline_version;
  }

  /* always take a newer change attr */
  if (change_attr > in->change_attr)
    in->change_attr = change_attr;

  // take the MDS's size when it has a newer truncate epoch, or a larger
  // size within the same epoch
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }

  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    // we hold caps that let us modify these fields locally, so only take
    // the MDS's times when its time_warp_seq says they are authoritative
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
		     << " is higher than local time_warp_seq "
		     << in->time_warp_seq << dendl;
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // the MDS's time_warp_seq went backwards without us holding EXCL;
    // this is unexpected, so log it loudly
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
783
784 void Client::_fragmap_remove_non_leaves(Inode *in)
785 {
786 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
787 if (!in->dirfragtree.is_leaf(p->first))
788 in->fragmap.erase(p++);
789 else
790 ++p;
791 }
792
793 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
794 {
795 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
796 if (p->second == mds)
797 in->fragmap.erase(p++);
798 else
799 ++p;
800 }
801
// Create or refresh the cached Inode for an InodeStat from an MDS reply,
// then merge in the issued cap.  Mutable fields are only updated when the
// reply's version is strictly newer than the local (projection-masked)
// version; fields covered by caps we hold exclusively are never clobbered.
// Returns the (possibly newly created) inode.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // the first inode we ever cache becomes the root
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // before mount completes, each new inode extends the chain of
      // the root's recorded ancestors
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  if (was_new)
    ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool updating_inode = false;
  int issued = 0;
  if (st->version == 0 ||
      (in->version & ~1) < st->version) {
    updating_inode = true;

    // fields covered by caps we hold (issued, implemented, or dirty) must
    // not be overwritten by the MDS's possibly-stale copy
    int implemented = 0;
    issued = in->caps_issued(&implemented) | in->caps_dirty();
    issued |= implemented;

    in->version = st->version;

    if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
      in->mode = st->mode;
      in->uid = st->uid;
      in->gid = st->gid;
      in->btime = st->btime;
    }

    if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
      in->nlink = st->nlink;
    }

    in->dirstat = st->dirstat;
    in->rstat = st->rstat;
    in->quota = st->quota;
    in->layout = st->layout;

    if (in->is_dir()) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
    }

    update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
			   st->change_attr, st->time_warp_seq, st->ctime,
			   st->mtime, st->atime, st->inline_version,
			   st->inline_data, issued);
  } else if (st->inline_version > in->inline_version) {
    // even when the inode itself is not newer, newer inline data is taken
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  // move me if/when version reflects fragtree changes.
  if (in->dirfragtree != st->dirfragtree) {
    in->dirfragtree = st->dirfragtree;
    _fragmap_remove_non_leaves(in);
  }

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
		   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
		   request_perms);
    // max_size is only meaningful when this session holds the auth cap
    if (in->auth_cap && in->auth_cap->session == session)
      in->max_size = st->max_size;
  } else
    in->snap_caps |= st->cap.caps;

  // setting I_COMPLETE needs to happen after adding the cap
  if (updating_inode &&
      in->is_dir() &&
      (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
      (issued & CEPH_CAP_FILE_EXCL) == 0 &&
      in->dirstat.nfiles == 0 &&
      in->dirstat.nsubdirs == 0) {
    ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
    in->flags |= I_COMPLETE | I_DIR_ORDERED;
    if (in->dir) {
      ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		     << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
      in->dir->readdir_cache.clear();
      for (auto p = in->dir->dentries.begin();
	   p != in->dir->dentries.end();
	   ++p) {
	unlink(p->second, true, true);  // keep dir, keep dentry
      }
      if (in->dir->dentries.empty())
	close_dir(in->dir);
    }
  }

  return in;
}
935
936
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the
 * metadata cache.  Reuses an existing same-named dentry when it already
 * points at the right inode; otherwise unlinks the stale dentry (and the
 * optional old_dentry, e.g. the source of a rename) and links a fresh one.
 * Always refreshes the dentry lease from the MDS reply.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      // stale dentry pointing at a different inode: detach it
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a temporary ref so the inode can't go away across the relink
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// moving out of a different directory breaks that dir's ordering
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
985
// Apply a dentry lease from an MDS reply: if the lease covers the dentry
// lock and extends beyond the current ttl, record the new ttl, issuing
// MDS, lease seq and cap generation.  Always syncs the dentry's
// cap_shared_gen with its parent directory's shared_gen.
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  // lease expiry is the request send time plus the granted duration
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
		     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
1005
1006
/*
 * update MDS location cache for a single inode: record (or erase) the
 * frag -> auth-mds mapping from a DirStat, force the frag to a leaf of
 * the fragtree if needed, and note whether the dirfrag is replicated.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth: a negative auth rank means "unknown", so drop the cached entry
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1041
1042 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1043 {
1044 if (diri->flags & I_COMPLETE) {
1045 if (complete) {
1046 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1047 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1048 } else {
1049 if (diri->flags & I_DIR_ORDERED) {
1050 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1051 diri->flags &= ~I_DIR_ORDERED;
1052 }
1053 }
1054 if (diri->dir)
1055 diri->dir->readdir_cache.clear();
1056 }
1057 }
1058
1059 /*
1060 * insert results from readdir or lssnap into the metadata cache.
1061 */
// Decode the extra bufferlist of a readdir/lssnap reply and fold the
// returned dentries/inodes into the metadata cache and into the
// dir_result_t buffer the caller is iterating with.  Maintains the
// per-dirp offset bookkeeping for both frag-ordered and hash-ordered
// readdir, and opportunistically fills the shared readdir cache.
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  MClientReply *reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();

  dir_result_t *dirp = request->dirp;
  assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  bufferlist::iterator p = reply->get_extra_bl().begin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    assert(dir);

    // dirstat
    DirStat dst(p);
    __u32 numdn;
    __u16 flags;
    ::decode(numdn, p);
    ::decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 marks the start of a frag (0/1 are reserved for . and ..)
    assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may have split/merged frags since we sent the request;
    // adopt the frag it actually answered for
    // NOTE(review): log prefix says "insert_trace" but we are in
    // insert_readdir_results — historical copy/paste in the message.
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    // only a readdir that starts at the very beginning of the directory
    // can hope to observe it completely; arm the completeness counters
    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      ::decode(dname, p);
      ::decode(dlease, p);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        // within a hash-ordered listing, offsets restart at 2 whenever
        // the dentry-name hash value changes
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          assert(0 == "unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // remember where to resume from on the next readdir request
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1214
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * A "trace" is the metadata the MDS attaches to a reply: optionally a
 * dentry record (parent dir inode stat + dentry name/lease) and
 * optionally the target inode stat itself.  This folds whatever the
 * reply carries into the local cache, handles traceless replies by
 * invalidating the affected directory, and dispatches readdir/lssnap
 * payloads to insert_readdir_results().
 *
 * Returns the target inode (also stashed in request->target), or NULL
 * when there is no trace or the unsafe reply was already applied.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // the unsafe reply's trace was already applied; the safe reply adds nothing
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    // traceless reply: our cached view of the affected directory is now stale
    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: if we asked for xattrs, the MDS must have sent them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target inode: a negative dentry; drop any stale
      // link and (if the lease is usable) cache the null dentry
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1384
1385 // -------
1386
/**
 * Pick the MDS rank a request should be sent to.
 *
 * Preference order: an explicit resend_mds on the request; a rank
 * derived from the request's inode/dentry (dirfrag -> mds via the
 * inode's fragmap, else the mds we hold caps from); otherwise a
 * random active mds.
 *
 * @param req the request being routed
 * @param phash_diri [out, optional] set to the directory inode whose
 *        fragmap supplied the rank when hash-based routing was used
 * @return the chosen mds rank
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component relative to the inode
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: route by the dentry name's hash in its parent
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // snapped metadata is served by the mds of the nearest non-snapped
      // ancestor; walk up until we find one (or hit an unlinked inode)
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dn_set.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      // map hash -> dirfrag -> mds rank that holds that frag
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
        ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
        goto out;
      }
    }

    // fall back to whatever mds issued us caps on this inode
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1486
1487
1488 void Client::connect_mds_targets(mds_rank_t mds)
1489 {
1490 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1491 assert(mds_sessions.count(mds));
1492 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1493 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1494 q != info.export_targets.end();
1495 ++q) {
1496 if (mds_sessions.count(*q) == 0 &&
1497 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1498 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1499 << " export target mds." << *q << dendl;
1500 _open_mds_session(*q);
1501 }
1502 }
1503 }
1504
1505 void Client::dump_mds_sessions(Formatter *f)
1506 {
1507 f->dump_int("id", get_nodeid().v);
1508 f->open_array_section("sessions");
1509 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1510 f->open_object_section("session");
1511 p->second->dump(f);
1512 f->close_section();
1513 }
1514 f->close_section();
1515 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1516 }
1517 void Client::dump_mds_requests(Formatter *f)
1518 {
1519 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1520 p != mds_requests.end();
1521 ++p) {
1522 f->open_object_section("request");
1523 p->second->dump(f);
1524 f->close_section();
1525 }
1526 }
1527
/**
 * Reconcile a reply against the request that produced it and fill in
 * *ptarget / *pcreated for the caller.
 *
 * If the extra bufferlist carries an ino, our create won the race and
 * *pcreated is set.  If the reply had no trace (request->target unset),
 * fall back to a lookup by name (or getattr on the request inode) to
 * find the inode we just created/operated on; returns -EINTR when the
 * inode found that way does not match the ino the create reported.
 *
 * @param r result code from the reply so far
 * @param request the originating request
 * @param reply the MDS reply (its extra_bl is consumed here)
 * @param ptarget [out] resolved target inode
 * @param pcreated [out, optional] whether we actually created the inode
 * @param perms credentials for the fallback lookup/getattr
 */
int Client::verify_reply_trace(int r,
                               MetaRequest *request, MClientReply *reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      // the created inode is already cached; use it directly
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          assert(0 == "how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1600
1601
/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * Must be called with client_lock held; the lock is dropped while
 * waiting on caller_cond.
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK may block indefinitely, so it is excluded from the
  // oldest-tid accounting we advertise to the MDS
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, ensure a session, send, and wait until we
  // either get a reply or must re-route (forward/kick/mdsmap change)
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        // the rank we derived no longer exists; drop the stale routing
        // hint and retry
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
           request->resend_mds < 0 && // forward
           !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // we only get here when the request was aborted before a reply
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request); // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
1768
1769 void Client::unregister_request(MetaRequest *req)
1770 {
1771 mds_requests.erase(req->tid);
1772 if (req->tid == oldest_tid) {
1773 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1774 while (true) {
1775 if (p == mds_requests.end()) {
1776 oldest_tid = 0;
1777 break;
1778 }
1779 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1780 oldest_tid = p->first;
1781 break;
1782 }
1783 ++p;
1784 }
1785 }
1786 put_request(req);
1787 }
1788
1789 void Client::put_request(MetaRequest *request)
1790 {
1791 if (request->_put()) {
1792 int op = -1;
1793 if (request->success)
1794 op = request->get_op();
1795 InodeRef other_in;
1796 request->take_other_inode(&other_in);
1797 delete request;
1798
1799 if (other_in &&
1800 (op == CEPH_MDS_OP_RMDIR ||
1801 op == CEPH_MDS_OP_RENAME ||
1802 op == CEPH_MDS_OP_RMSNAP)) {
1803 _try_to_trim_inode(other_in.get(), false);
1804 }
1805 }
1806 }
1807
1808 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1809 mds_rank_t mds, int drop,
1810 int unless, int force)
1811 {
1812 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1813 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1814 << ", have:" << ", force:" << force << ")" << dendl;
1815 int released = 0;
1816 if (in->caps.count(mds)) {
1817 Cap *caps = in->caps[mds];
1818 drop &= ~(in->dirty_caps | get_caps_used(in));
1819 if ((drop & caps->issued) &&
1820 !(unless & caps->issued)) {
1821 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1822 caps->issued &= ~drop;
1823 caps->implemented &= ~drop;
1824 released = 1;
1825 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1826 } else {
1827 released = force;
1828 }
1829 if (released) {
1830 ceph_mds_request_release rel;
1831 rel.ino = in->ino;
1832 rel.cap_id = caps->cap_id;
1833 rel.seq = caps->seq;
1834 rel.issue_seq = caps->issue_seq;
1835 rel.mseq = caps->mseq;
1836 rel.caps = caps->implemented;
1837 rel.wanted = caps->wanted;
1838 rel.dname_len = 0;
1839 rel.dname_seq = 0;
1840 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1841 }
1842 }
1843 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1844 << released << dendl;
1845 return released;
1846 }
1847
// Append a dentry-lease release for `dn` to the request.  A dentry
// release piggybacks on an inode release for the parent directory
// (force=1 guarantees encode_inode_release appends a record), whose
// dname fields we then fill in via cap_releases.back().
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
                           mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << "encode_dentry_release enter(dn:"
           << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
                                    mds, drop, unless, 1);
  // only hand back the lease if it was issued by the mds this request
  // is going to
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    MClientRequest::Release& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << "encode_dentry_release exit(dn:"
           << dn << ")" << dendl;
}
1867
1868
1869 /*
1870 * This requires the MClientRequest *request member to be set.
1871 * It will error out horribly without one.
1872 * Additionally, if you set any *drop member, you'd better have
1873 * set the corresponding dentry!
1874 */
1875 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1876 {
1877 ldout(cct, 20) << "encode_cap_releases enter (req: "
1878 << req << ", mds: " << mds << ")" << dendl;
1879 if (req->inode_drop && req->inode())
1880 encode_inode_release(req->inode(), req,
1881 mds, req->inode_drop,
1882 req->inode_unless);
1883
1884 if (req->old_inode_drop && req->old_inode())
1885 encode_inode_release(req->old_inode(), req,
1886 mds, req->old_inode_drop,
1887 req->old_inode_unless);
1888 if (req->other_inode_drop && req->other_inode())
1889 encode_inode_release(req->other_inode(), req,
1890 mds, req->other_inode_drop,
1891 req->other_inode_unless);
1892
1893 if (req->dentry_drop && req->dentry())
1894 encode_dentry_release(req->dentry(), req,
1895 mds, req->dentry_drop,
1896 req->dentry_unless);
1897
1898 if (req->old_dentry_drop && req->old_dentry())
1899 encode_dentry_release(req->old_dentry(), req,
1900 mds, req->old_dentry_drop,
1901 req->old_dentry_unless);
1902 ldout(cct, 25) << "encode_cap_releases exit (req: "
1903 << req << ", mds " << mds <<dendl;
1904 }
1905
1906 bool Client::have_open_session(mds_rank_t mds)
1907 {
1908 return
1909 mds_sessions.count(mds) &&
1910 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1911 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1912 }
1913
1914 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1915 {
1916 if (mds_sessions.count(mds) == 0)
1917 return NULL;
1918 MetaSession *s = mds_sessions[mds];
1919 if (s->con != con)
1920 return NULL;
1921 return s;
1922 }
1923
1924 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1925 {
1926 if (mds_sessions.count(mds))
1927 return mds_sessions[mds];
1928 return _open_mds_session(mds);
1929 }
1930
1931 /**
1932 * Populate a map of strings with client-identifying metadata,
1933 * such as the hostname. Call this once at initialization.
1934 */
1935 void Client::populate_metadata(const std::string &mount_root)
1936 {
1937 // Hostname
1938 struct utsname u;
1939 int r = uname(&u);
1940 if (r >= 0) {
1941 metadata["hostname"] = u.nodename;
1942 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1943 } else {
1944 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1945 }
1946
1947 metadata["pid"] = stringify(getpid());
1948
1949 // Ceph entity id (the '0' in "client.0")
1950 metadata["entity_id"] = cct->_conf->name.get_id();
1951
1952 // Our mount position
1953 if (!mount_root.empty()) {
1954 metadata["root"] = mount_root;
1955 }
1956
1957 // Ceph version
1958 metadata["ceph_version"] = pretty_version_to_str();
1959 metadata["ceph_sha1"] = git_version_to_str();
1960
1961 // Apply any metadata from the user's configured overrides
1962 std::vector<std::string> tokens;
1963 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1964 for (const auto &i : tokens) {
1965 auto eqpos = i.find("=");
1966 // Throw out anything that isn't of the form "<str>=<str>"
1967 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1968 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1969 continue;
1970 }
1971 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1972 }
1973 }
1974
1975 /**
1976 * Optionally add or override client metadata fields.
1977 */
1978 void Client::update_metadata(std::string const &k, std::string const &v)
1979 {
1980 Mutex::Locker l(client_lock);
1981 assert(initialized);
1982
1983 if (metadata.count(k)) {
1984 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
1985 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
1986 }
1987
1988 metadata[k] = v;
1989 }
1990
1991 MetaSession *Client::_open_mds_session(mds_rank_t mds)
1992 {
1993 ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
1994 assert(mds_sessions.count(mds) == 0);
1995 MetaSession *session = new MetaSession;
1996 session->mds_num = mds;
1997 session->seq = 0;
1998 session->inst = mdsmap->get_inst(mds);
1999 session->con = messenger->get_connection(session->inst);
2000 session->state = MetaSession::STATE_OPENING;
2001 session->mds_state = MDSMap::STATE_NULL;
2002 mds_sessions[mds] = session;
2003
2004 // Maybe skip sending a request to open if this MDS daemon
2005 // has previously sent us a REJECT.
2006 if (rejected_by_mds.count(mds)) {
2007 if (rejected_by_mds[mds] == session->inst) {
2008 ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
2009 "because we were rejected" << dendl;
2010 return session;
2011 } else {
2012 ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
2013 "rejected us, trying with new inst" << dendl;
2014 rejected_by_mds.erase(mds);
2015 }
2016 }
2017
2018 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
2019 m->client_meta = metadata;
2020 session->con->send_message(m);
2021 return session;
2022 }
2023
// Ask the MDS to close this session.  We only move to CLOSING here;
// the transition to CLOSED happens when the CEPH_SESSION_CLOSE reply
// arrives (see handle_client_session).
void Client::_close_mds_session(MetaSession *s)
{
  ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSING;
  s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
}
2030
// Tear down local state for a session that is now closed.  Order
// matters: mark the connection down, wake anyone waiting on the open,
// drop the session's caps, and fail/redirect its in-flight requests
// before the session object is erased and deleted.
void Client::_closed_mds_session(MetaSession *s)
{
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
  delete s;
}
2042
// Dispatch an incoming MClientSession message from an MDS.  Messages
// for ranks we have no (matching) session with are discarded.  The
// message reference is consumed (m->put()) on every path.
void Client::handle_client_session(MClientSession *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  // only accept the message if it arrived on the connection our
  // session is bound to
  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    // session established: renew caps, wake waiters, and pre-open
    // sessions to this rank's export targets (unless we're unmounting)
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only extend the cap TTL if this ack matches our latest renew request
    if (session->cap_renew_seq == m->get_seq()) {
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      wake_inode_waiters(session);
    }
    break;

  case CEPH_SESSION_STALE:
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    // MDS is under cache pressure; trim down to the requested cap count
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    // remember the rejecting instance so we don't keep retrying it
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);

    break;

  default:
    ceph_abort();
  }

  m->put();
}
2106
2107 bool Client::_any_stale_sessions() const
2108 {
2109 assert(client_lock.is_locked_by_me());
2110
2111 for (const auto &i : mds_sessions) {
2112 if (i.second->state == MetaSession::STATE_STALE) {
2113 return true;
2114 }
2115 }
2116
2117 return false;
2118 }
2119
2120 void Client::_kick_stale_sessions()
2121 {
2122 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2123
2124 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2125 p != mds_sessions.end(); ) {
2126 MetaSession *s = p->second;
2127 ++p;
2128 if (s->state == MetaSession::STATE_STALE)
2129 _closed_mds_session(s);
2130 }
2131 }
2132
/**
 * Build and transmit the wire message for a MetaRequest on a session.
 *
 * The message is rebuilt on every (re)send so it reflects the current
 * retry/forward counts.  Replayed (got_unsafe) requests are flagged for
 * the MDS; fresh sends attach cap releases unless drop_cap_releases is
 * set (used before cap reconnect).  Also records bookkeeping needed to
 * match the reply: sent_stamp, target mds, and sent_on_mseq.
 */
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // replay of an op the MDS applied but never committed; pin the
    // created target ino so the replay hits the same inode
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may name a data pool; give the MDS our osdmap epoch so it
    // can validate the pool exists
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // first transmission of this request
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap migration seq so an ESTALE reply can tell whether
  // caps have moved since we sent
  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2177
/**
 * Construct the wire-format MClientRequest for a MetaRequest.
 *
 * Copies the request head, fills in the filepath(s) -- deriving one
 * from the inode or dentry if none was set -- attaches payload data and
 * supplementary gids, and bumps the retry counter.  Caller owns the
 * returned message.
 */
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// null dentry: path of the parent dir plus the dentry name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
			 << " No path, inode, or appropriately-endowed dentry given!"
			 << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		       << " No path, inode, or dentry given!"
		       << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // record this attempt (post-increment) so the MDS can detect retries
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2215
2216
2217
2218 void Client::handle_client_request_forward(MClientRequestForward *fwd)
2219 {
2220 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2221 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2222 if (!session) {
2223 fwd->put();
2224 return;
2225 }
2226 ceph_tid_t tid = fwd->get_tid();
2227
2228 if (mds_requests.count(tid) == 0) {
2229 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2230 fwd->put();
2231 return;
2232 }
2233
2234 MetaRequest *request = mds_requests[tid];
2235 assert(request);
2236
2237 // reset retry counter
2238 request->retry_attempt = 0;
2239
2240 // request not forwarded, or dest mds has no session.
2241 // resend.
2242 ldout(cct, 10) << "handle_client_request tid " << tid
2243 << " fwd " << fwd->get_num_fwd()
2244 << " to mds." << fwd->get_dest_mds()
2245 << ", resending to " << fwd->get_dest_mds()
2246 << dendl;
2247
2248 request->mds = -1;
2249 request->item.remove_myself();
2250 request->num_fwd = fwd->get_num_fwd();
2251 request->resend_mds = fwd->get_dest_mds();
2252 request->caller_cond->Signal();
2253
2254 fwd->put();
2255 }
2256
2257 bool Client::is_dir_operation(MetaRequest *req)
2258 {
2259 int op = req->get_op();
2260 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2261 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2262 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2263 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2264 return true;
2265 return false;
2266 }
2267
/**
 * Handle an MClientReply from an MDS.
 *
 * A request can receive two replies: an early "unsafe" one (the op is
 * applied but not yet journaled) and a later "safe" one (committed).
 * The caller thread is woken exactly once, on the first reply, via a
 * cond-var handshake; the safe reply then tears down request
 * bookkeeping.  ESTALE replies trigger a retry against a better MDS
 * when one is available.  Consumes (puts) the message.
 */
void Client::handle_client_reply(MClientReply *reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // only worth resending if we would reach a different MDS, or the
    // same MDS after a cap migration (mseq changed); otherwise we must
    // surface the ESTALE to the caller
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      request->caller_cond->Signal();
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }
  
  assert(request->reply == NULL);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    // track the uncommitted op on the session, its directory, and its
    // target inode so fsync/close can wait for it to become safe
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wake for kick back: block until the caller has consumed the reply
    // and cleared dispatch_cond
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
2370
/**
 * React to a FULL flag on a pool (or, when pool == -1, on the whole
 * cluster): cancel outstanding writes with -ENOSPC and purge un-flushed
 * cached data so it is not re-issued.
 */
void Client::_handle_full_flag(int64_t pool)
{
  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
		<< "on " << pool << dendl;
  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
  // to do this rather than blocking, because otherwise when we fill up we
  // potentially lock caps forever on files with dirty pages, and we need
  // to be able to release those caps to the MDS so that it can delete files
  // and free up space.
  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);

  // For all inodes with layouts in this pool and a pending flush write op
  // (i.e. one of the ones we will cancel), we've got to purge_set their data
  // from ObjectCacher so that it doesn't re-issue the write in response to
  // the ENOSPC error.
  // Fortunately since we're cancelling everything in a given pool, we don't
  // need to know which ops belong to which ObjectSet, we can just blow all
  // the un-flushed cached data away and mark any dirty inodes' async_err
  // field with -ENOSPC as long as we're sure all the ops we cancelled were
  // affecting this pool, and all the objectsets we're purging were also
  // in this pool.
  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
       i != inode_map.end(); ++i)
  {
    Inode *inode = i->second;
    if (inode->oset.dirty_or_tx
	&& (pool == -1 || inode->layout.pool_id == pool)) {
      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
		    << " has dirty objects, purging and setting ENOSPC" << dendl;
      objectcacher->purge_set(&inode->oset);
      inode->set_async_err(-ENOSPC);
    }
  }

  // raise the cap epoch barrier so the MDS doesn't see stale cap state
  // from before the cancellations
  if (cancelled_epoch != (epoch_t)-1) {
    set_cap_epoch_barrier(cancelled_epoch);
  }
}
2409
/**
 * Handle a new OSDMap.
 *
 * Detects our own blacklisting (aborting all in-flight MDS requests and
 * force-closing sessions), notices when a blacklist entry is lifted,
 * and reacts to cluster-wide or per-pool FULL flags by cancelling
 * outstanding writes with -ENOSPC.  Consumes (puts) the message.
 */
void Client::handle_osd_map(MOSDMap *m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddr = messenger->get_myaddr();
  if (!blacklisted && new_blacklists.count(myaddr)) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
      });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;
    // abort every in-flight MDS request and wake its caller
    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
	 p != mds_requests.end(); ) {
      auto req = p->second;
      ++p;
      req->abort(-EBLACKLISTED);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
    }

    // Progress aborts on any requests that were on this waitlist.  Any
    // requests that were on a waiting_for_open session waitlist
    // will get kicked during close session below.
    signal_cond_list(waiting_for_mdsmap);

    // Force-close all sessions: assume this is not abandoning any state
    // on the MDS side because the MDS will have seen the blacklist too.
    while(!mds_sessions.empty()) {
      auto i = mds_sessions.begin();
      auto session = i->second;
      _closed_mds_session(session);
    }

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
	return o.is_blacklisted(myaddr);});
  }

  if (objecter->osdmap_full_flag()) {
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away. For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }

  m->put();
}
2487
2488
2489 // ------------------------
2490 // incoming messages
2491
2492
/**
 * Main message dispatch entry point (called by the messenger).
 *
 * Takes client_lock, routes each message type to its handler, and
 * returns false for types we don't consume (so other dispatchers get a
 * chance).  While unmounting, each dispatched message is followed by a
 * cache-trim pass so unmount can make progress as references drain.
 */
bool Client::ms_dispatch(Message *m)
{
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    // only MDS command replies are ours; others belong elsewhere
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
		   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    // NOTE(review): this condition is true when the cache total *grew*
    // across trim_cache(), yet the log claims it shrank -- a shrink
    // would leave the new total smaller than `size`.  Looks inverted;
    // confirm intended semantics before changing unmount signaling.
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2570
2571 void Client::handle_fs_map(MFSMap *m)
2572 {
2573 fsmap.reset(new FSMap(m->get_fsmap()));
2574 m->put();
2575
2576 signal_cond_list(waiting_for_fsmap);
2577
2578 monclient->sub_got("fsmap", fsmap->get_epoch());
2579 }
2580
2581 void Client::handle_fs_map_user(MFSMapUser *m)
2582 {
2583 fsmap_user.reset(new FSMapUser);
2584 *fsmap_user = m->get_fsmap();
2585 m->put();
2586
2587 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2588 signal_cond_list(waiting_for_fsmap);
2589 }
2590
/**
 * Handle a new MDSMap epoch.
 *
 * Ignores maps no newer than the one we hold.  Cancels commands aimed
 * at MDS daemons that vanished or went laggy, then walks every session
 * and reacts to rank state changes: mark connections down, trigger
 * reconnect, kick requests/caps when a rank becomes active, or close
 * sessions for ranks removed from the map.  Consumes (puts) the
 * message.
 */
void Client::handle_mds_map(MMDSMap* m)
{
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
		  << " is identical to or older than our "
		  << mdsmap->get_epoch() << dendl;
    m->put();
    return;
  }  

  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;

  // keep the previous map so each rank's state can be diffed against it
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
	std::ostringstream ss;
	ss << "MDS " << op_mds_gid << " went away";
	*(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
	op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // erase after the scan above so iteration over commands stays valid
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = p->second;
    ++p;  // advance first: _closed_mds_session() below erases entries

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_inst(mds) != session->inst) {
      // the rank moved to a different daemon/address
      session->con->mark_down();
      session->inst = mdsmap->get_inst(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change
    
    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->get_connection(session->inst);
      send_reconnect(session);
    } else if (newstate >= MDSMap::STATE_ACTIVE) {
      if (oldstate < MDSMap::STATE_ACTIVE) {
	// kick new requests
	kick_requests(session);
	kick_flushing_caps(session);
	signal_context_list(session->waiting_for_open);
	kick_maxsize_requests(session);
	wake_inode_waiters(session);
      }
      connect_mds_targets(mds);
    } else if (newstate == MDSMap::STATE_NULL &&
	       mds >= mdsmap->get_max_mds()) {
      // rank no longer exists (cluster shrank); drop the session
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  m->put();

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2682
/**
 * Send an MClientReconnect to an MDS that entered reconnect state.
 *
 * Trims the cache, resets per-session state (readonly flag, pending
 * cap releases, cap seq), resends unsafe requests, then describes every
 * cap we hold from this MDS (path, wanted/issued bits, file locks,
 * snaprealms) so the recovering MDS can rebuild its session state.
 */
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // drop any batched cap-release message; it is stale now
  if (session->release) {
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(in->caps[mds]->issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      // claim everything we had implemented; the MDS re-issues from this
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino, 
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // describe each snaprealm only once per reconnect message
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }	
    }
  }

  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
2758
2759
2760 void Client::kick_requests(MetaSession *session)
2761 {
2762 ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2763 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2764 p != mds_requests.end();
2765 ++p) {
2766 MetaRequest *req = p->second;
2767 if (req->got_unsafe)
2768 continue;
2769 if (req->aborted()) {
2770 if (req->caller_cond) {
2771 req->kick = true;
2772 req->caller_cond->Signal();
2773 }
2774 continue;
2775 }
2776 if (req->retry_attempt > 0)
2777 continue; // new requests only
2778 if (req->mds == session->mds_num) {
2779 send_request(p->second, session);
2780 }
2781 }
2782 }
2783
2784 void Client::resend_unsafe_requests(MetaSession *session)
2785 {
2786 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2787 !iter.end();
2788 ++iter)
2789 send_request(*iter, session);
2790
2791 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2792 // process completed requests in clientreplay stage.
2793 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2794 p != mds_requests.end();
2795 ++p) {
2796 MetaRequest *req = p->second;
2797 if (req->got_unsafe)
2798 continue;
2799 if (req->aborted())
2800 continue;
2801 if (req->retry_attempt == 0)
2802 continue; // old requests only
2803 if (req->mds == session->mds_num)
2804 send_request(req, session, true);
2805 }
2806 }
2807
2808 void Client::wait_unsafe_requests()
2809 {
2810 list<MetaRequest*> last_unsafe_reqs;
2811 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2812 p != mds_sessions.end();
2813 ++p) {
2814 MetaSession *s = p->second;
2815 if (!s->unsafe_requests.empty()) {
2816 MetaRequest *req = s->unsafe_requests.back();
2817 req->get();
2818 last_unsafe_reqs.push_back(req);
2819 }
2820 }
2821
2822 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2823 p != last_unsafe_reqs.end();
2824 ++p) {
2825 MetaRequest *req = *p;
2826 if (req->unsafe_item.is_on_list())
2827 wait_on_list(req->waitfor_safe);
2828 put_request(req);
2829 }
2830 }
2831
/**
 * A session is being torn down: wake any caller threads blocked on its
 * requests and drop all request state tied to the session.  Unsafe
 * (uncommitted) requests are forcibly unregistered -- the MDS may or
 * may not have journaled their effects.
 */
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;  // advance before unregister_request() can invalidate p
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
2858
2859
2860
2861
2862 /************
2863 * leases
2864 */
2865
2866 void Client::got_mds_push(MetaSession *s)
2867 {
2868 s->seq++;
2869 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2870 if (s->state == MetaSession::STATE_CLOSING) {
2871 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2872 }
2873 }
2874
/**
 * Handle a lease revocation from an MDS.
 *
 * Only CEPH_MDS_LEASE_REVOKE is expected.  If we still hold the named
 * dentry's lease, invalidate it; in every case ack back with a
 * CEPH_MDS_LEASE_RELEASE carrying the same seq.  Consumes (puts) the
 * message.
 */
void Client::handle_lease(MClientLease *m)
{
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // the cached dentry lease is no longer valid
  }

 revoke:
  // always ack, even if we no longer held the lease
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2917
/**
 * Drop n references on an inode.  On the last reference: release all
 * caps, verify the object cache holds no unclean data, remove the inode
 * from inode_map (and the faked-ino table), clear root bookkeeping if
 * this was the root, and delete it.
 */
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << "put_inode on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    // the cache set must be clean by now; unclean data here would mean a
    // reference was dropped while writes were still outstanding
    bool unclean = objectcacher->release_set(&in->oset);
    assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
	root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
2943
/**
 * Destroy an empty Dir object and release its pins.
 * Preconditions: the dir has no dentries left and is still attached to
 * its parent inode.
 */
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
  assert(dir->is_empty());
  assert(in->dir == dir);
  assert(in->dn_set.size() < 2); // dirs can't be hard-linked
  if (!in->dn_set.empty())
    in->get_first_parent()->put(); // unpin dentry
  
  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}
2958
/**
 * Link an inode into a directory under the given name.
 *
 * Don't call this with in==NULL, use get_or_create for that;
 * leave dn set to its default NULL unless you're trying to add
 * a new inode to a pre-created Dentry.
 *
 * Creates the dentry when none is supplied, takes extra dentry pins for
 * directories (dir and ll_ref), and enforces the single-parent rule for
 * directories by unlinking any previous parent dentry.
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;
    
    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn);    // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    if (in->is_dir()) {
      // extra pins keep a directory's dentry alive while its Dir or
      // ll refs are outstanding (dropped again in unlink())
      if (in->dir)
	dn->get(); // dir -> dn pin
      if (in->ll_ref)
	dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dn_set << dendl; 
  }
  
  return dn;
}
3011
3012 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3013 {
3014 InodeRef in;
3015 in.swap(dn->inode);
3016 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3017 << " inode " << dn->inode << dendl;
3018
3019 // unlink from inode
3020 if (in) {
3021 if (in->is_dir()) {
3022 if (in->dir)
3023 dn->put(); // dir -> dn pin
3024 if (in->ll_ref)
3025 dn->put(); // ll_ref -> dn pin
3026 }
3027 dn->inode = 0;
3028 assert(in->dn_set.count(dn));
3029 in->dn_set.erase(dn);
3030 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3031 }
3032
3033 if (keepdentry) {
3034 dn->lease_mds = -1;
3035 } else {
3036 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3037
3038 // unlink from dir
3039 dn->dir->dentries.erase(dn->name);
3040 if (dn->dir->is_empty() && !keepdir)
3041 close_dir(dn->dir);
3042 dn->dir = 0;
3043
3044 // delete den
3045 lru.lru_remove(dn);
3046 dn->put();
3047 }
3048 }
3049
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;   // holds a ref so the inode outlives the flush
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  // Runs with client_lock held; stashes a flush error on the inode so a
  // later fsync/close can report it to the application.
  void finish(int r) override {
    assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
			    << " 0x" << std::hex << inode->ino << std::dec
			    << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3071
3072
3073 /****
3074 * caps
3075 */
3076
3077 void Client::get_cap_ref(Inode *in, int cap)
3078 {
3079 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3080 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3081 ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3082 in->get();
3083 }
3084 if ((cap & CEPH_CAP_FILE_CACHE) &&
3085 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3086 ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3087 in->get();
3088 }
3089 in->get_cap_ref(cap);
3090 }
3091
/**
 * Drop references on cap bits.
 *
 * When the last reference on a bit is released this may: finish a
 * pending cap_snap (last FILE_WR ref), clear snap dirty-data flags and
 * wake commit waiters (last FILE_BUFFER ref), release the inode pins
 * taken in get_cap_ref() for BUFFER/CACHE, and check_caps() to return
 * any bits the MDS has since revoked.
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we held that are no longer issued must be returned to the MDS
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;  // drop the inode pin taken by get_cap_ref
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;  // drop the inode pin taken by get_cap_ref
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3125
/*
 * Block until we hold at least the @need caps (plus whatever of @want is
 * issued and not being revoked), taking a cap reference on @need.
 *
 * @param in inode to acquire caps on
 * @param need caps we must have before returning
 * @param want additional caps to include in *phave if available
 * @param phave out: the caps acquired, i.e. need | (have & want)
 * @param endoff for writes, the end offset of the write; used to request a
 *        larger max_size from the MDS and to wait until writing that far is
 *        permitted
 * @return 0 on success; -EBADF if no open handle wants the needed caps any
 *         more; -EROFS for writes on a read-only session; or an error from
 *         the pool-permission check / cap renewal.
 */
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      // no open file handle wants these caps anymore
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // ask the MDS for a bigger max_size if this write approaches/exceeds it
      if (endoff > 0 &&
          (endoff >= (loff_t)in->max_size ||
           endoff > (loff_t)(in->size << 1)) &&
          endoff > (loff_t)in->wanted_max_size) {
        ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
        in->wanted_max_size = endoff;
        check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        // cannot write past max_size until the MDS grants more
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        // writes must not race with an in-progress cap_snap
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          // kick a flush so the snaps' dirty buffered data gets written out
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                 << " need " << ccap_string(need) << " want " << ccap_string(want)
                 << " revoking " << ccap_string(revoking)
                 << dendl;
        // only report wanted caps that are not currently being revoked
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // our caps were dropped (e.g. by session teardown); re-request them
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if ((mds_wanted & file_wanted) ==
          (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
        // MDS already knows everything our handles want; clear the flag
        in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3220
3221 int Client::get_caps_used(Inode *in)
3222 {
3223 unsigned used = in->caps_used();
3224 if (!(used & CEPH_CAP_FILE_CACHE) &&
3225 !objectcacher->set_is_empty(&in->oset))
3226 used |= CEPH_CAP_FILE_CACHE;
3227 return used;
3228 }
3229
3230 void Client::cap_delay_requeue(Inode *in)
3231 {
3232 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3233 in->hold_caps_until = ceph_clock_now();
3234 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3235 delayed_caps.push_back(&in->cap_item);
3236 }
3237
/*
 * Send a cap update for one cap to its MDS, trimming cap->issued to
 * @retain (which releases anything not retained and acks revocations).
 *
 * @param in inode the cap belongs to
 * @param session session of the MDS that granted the cap
 * @param cap the cap to update; issued/implemented are adjusted here
 * @param sync if true, request a synchronous ack (CLIENT_CAPS_SYNC)
 * @param used caps currently in active use
 * @param want caps we want the MDS to keep issuing
 * @param retain caps we would like to keep holding
 * @param flush dirty cap bits being flushed in this message (0 if none)
 * @param flush_tid tid identifying the flush (0 if not flushing)
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      bool sync, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain caps that are being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
           << " mds." << session->mds_num << " seq " << cap->seq
           << (sync ? " sync " : " async ")
           << " used " << ccap_string(used)
           << " want " << ccap_string(want)
           << " flush " << ccap_string(flush)
           << " retain " << ccap_string(retain)
           << " held "<< ccap_string(held)
           << " revoking " << ccap_string(revoking)
           << " dropping " << ccap_string(dropping)
           << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour: shrink issued to what we retain, and implemented
    // to what is issued or still actively used.
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
                                   in->ino,
                                   0,
                                   cap->cap_id, cap->seq,
                                   cap->implemented,
                                   want,
                                   flush,
                                   cap->mseq,
                                   cap_epoch_barrier);
  // attribute the flush to whoever dirtied the caps
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    // flushing xattr changes: include the full xattr map
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth MDS handles max_size requests
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3347
3348 static bool is_max_size_approaching(Inode *in)
3349 {
3350 /* mds will adjust max size according to the reported size */
3351 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3352 return false;
3353 if (in->size >= in->max_size)
3354 return true;
3355 /* half of previous max_size increment has been used */
3356 if (in->max_size > in->reported_size &&
3357 (in->size << 1) >= in->max_size + in->reported_size)
3358 return true;
3359 return false;
3360 }
3361
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_NODELAY /
 *        CHECK_CAPS_SYNCHRONOUS)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  if (in->is_dir() && (in->flags & I_COMPLETE)) {
    // we do this here because we don't want to drop to Fs (and then
    // drop the Fs if we do a create!) if that alone makes us send lookups
    // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
    wanted |= CEPH_CAP_FILE_EXCL;
  }

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  // keep what is wanted/used plus PIN; while mounted, retain broader caps
  // so we don't thrash with the MDS
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting) {
    if (wanted)
      retain |= CEPH_CAP_ANY;
    else
      retain |= CEPH_CAP_ANY_SHARED;
  }

  ldout(cct, 10) << "check_caps on " << *in
           << " wanted " << ccap_string(wanted)
           << " used " << ccap_string(used)
           << " issued " << ccap_string(issued)
           << " revoking " << ccap_string(revoking)
           << " flags=" << flags
           << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return; // guard if at end of func

  // CACHE is being revoked and no dirty buffers remain: try to drop the
  // cached data so we can release the cap
  if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
      (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) {
    if (_release(in))
      used &= ~CEPH_CAP_FILE_CACHE;
  }

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  if (flags & CHECK_CAPS_NODELAY)
    in->hold_caps_until = utime_t();
  else
    cap_delay_requeue(in);

  utime_t now = ceph_clock_now();

  map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
  while (it != in->caps.end()) {
    mds_rank_t mds = it->first;
    Cap *cap = it->second;
    ++it;  // advance before acting on cap, in case its map entry changes

    MetaSession *session = mds_sessions[mds];
    assert(session);

    // caps covered by the auth cap do not count as "used" on non-auth MDSes
    cap_used = used;
    if (in->auth_cap && cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap->implemented & ~cap->issued;

    ldout(cct, 10) << " cap mds." << mds
             << " issued " << ccap_string(cap->issued)
             << " implemented " << ccap_string(cap->implemented)
             << " revoking " << ccap_string(revoking) << dendl;

    // need a larger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap->issued & CEPH_CAP_FILE_WR) &&
        cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap->wanted | cap->issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if (wanted == cap->wanted &&         // mds knows what we want.
        ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
        !in->dirty_caps)                 // and we have no dirty caps
      continue;

    if (now < in->hold_caps_until) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
        session->mds_state < MDSMap::STATE_ACTIVE &&
        session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
        flush_snaps(in, true);
      if (in->flushing_caps)
        flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == cap && in->dirty_caps) {
      // dirty caps are flushed through the auth cap only
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
             retain, flushing, flush_tid);
  }
}
3510
3511
/*
 * Capture the inode's current cap state into a new cap_snap keyed by the
 * old snap context's seq, so dirty state as of snapshot time can be
 * flushed to the MDS separately from later writes.
 */
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // a cap_snap is already pending behind an active writer; don't stack another
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    // NOTE(review): `dirty` is in->caps_dirty() captured above, so the third
    // clause appears subsumed by the first; kept as-is for safety.
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    // remember whether buffered (uncommitted) data existed at snap time
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    // snapshot the metadata as of the old snap context
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // a writer is active; finalize when the last WR ref drops
      // (see put_cap_ref)
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3551
/*
 * Finalize a cap_snap: record sizes/timestamps as of snapshot time and,
 * unless buffered data is still outstanding, flush it to the MDS now.
 * If buffered data remains, the flush is deferred to _flushed_cap_snap().
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  // fold in any caps dirtied since the snap was queued
  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // buffered data still outstanding; flush happens when it commits
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
             << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3577
3578 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3579 {
3580 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3581 in->cap_snaps.at(seq).dirty_data = 0;
3582 flush_snaps(in);
3583 }
3584
3585 void Client::flush_snaps(Inode *in, bool all_again)
3586 {
3587 ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
3588 assert(in->cap_snaps.size());
3589
3590 // pick auth mds
3591 assert(in->auth_cap);
3592 MetaSession *session = in->auth_cap->session;
3593 int mseq = in->auth_cap->mseq;
3594
3595 for (auto &p : in->cap_snaps) {
3596 CapSnap &capsnap = p.second;
3597 if (!all_again) {
3598 // only flush once per session
3599 if (capsnap.flush_tid > 0)
3600 continue;
3601 }
3602
3603 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3604 << " follows " << p.first
3605 << " size " << capsnap.size
3606 << " mtime " << capsnap.mtime
3607 << " dirty_data=" << capsnap.dirty_data
3608 << " writing=" << capsnap.writing
3609 << " on " << *in << dendl;
3610 if (capsnap.dirty_data || capsnap.writing)
3611 continue;
3612
3613 if (capsnap.flush_tid == 0) {
3614 capsnap.flush_tid = ++last_flush_tid;
3615 if (!in->flushing_cap_item.is_on_list())
3616 session->flushing_caps.push_back(&in->flushing_cap_item);
3617 session->flushing_caps_tids.insert(capsnap.flush_tid);
3618 }
3619
3620 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
3621 cap_epoch_barrier);
3622 if (user_id >= 0)
3623 m->caller_uid = user_id;
3624 if (group_id >= 0)
3625 m->caller_gid = group_id;
3626
3627 m->set_client_tid(capsnap.flush_tid);
3628 m->head.snap_follows = p.first;
3629
3630 m->head.caps = capsnap.issued;
3631 m->head.dirty = capsnap.dirty;
3632
3633 m->head.uid = capsnap.uid;
3634 m->head.gid = capsnap.gid;
3635 m->head.mode = capsnap.mode;
3636 m->btime = capsnap.btime;
3637
3638 m->size = capsnap.size;
3639
3640 m->head.xattr_version = capsnap.xattr_version;
3641 ::encode(capsnap.xattrs, m->xattrbl);
3642
3643 m->ctime = capsnap.ctime;
3644 m->btime = capsnap.btime;
3645 m->mtime = capsnap.mtime;
3646 m->atime = capsnap.atime;
3647 m->time_warp_seq = capsnap.time_warp_seq;
3648 m->change_attr = capsnap.change_attr;
3649
3650 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3651 m->inline_version = in->inline_version;
3652 m->inline_data = in->inline_data;
3653 }
3654
3655 assert(!session->flushing_caps_tids.empty());
3656 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3657
3658 session->con->send_message(m);
3659 }
3660 }
3661
3662
3663
3664 void Client::wait_on_list(list<Cond*>& ls)
3665 {
3666 Cond cond;
3667 ls.push_back(&cond);
3668 cond.Wait(client_lock);
3669 ls.remove(&cond);
3670 }
3671
3672 void Client::signal_cond_list(list<Cond*>& ls)
3673 {
3674 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3675 (*it)->Signal();
3676 }
3677
/*
 * Block on client_lock until a context queued on @ls is completed (see
 * signal_context_list()).  The queued C_Cond frees itself when completed;
 * its result code is written to r but ignored here.
 */
void Client::wait_on_context_list(list<Context*>& ls)
{
  Cond cond;
  bool done = false;
  int r;  // written by C_Cond on completion; value unused
  ls.push_back(new C_Cond(&cond, &done, &r));
  while (!done)
    cond.Wait(client_lock);
}
3687
3688 void Client::signal_context_list(list<Context*>& ls)
3689 {
3690 while (!ls.empty()) {
3691 ls.front()->complete(0);
3692 ls.pop_front();
3693 }
3694 }
3695
3696 void Client::wake_inode_waiters(MetaSession *s)
3697 {
3698 xlist<Cap*>::iterator iter = s->caps.begin();
3699 while (!iter.end()){
3700 signal_cond_list((*iter)->inode->waitfor_caps);
3701 ++iter;
3702 }
3703 }
3704
3705
3706 // flush dirty data (from objectcache)
3707
// Completion that runs the user's cache-invalidate callback from the
// finisher thread, outside client_lock.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;          // captured at construction; does not pin the inode
  int64_t offset, length;  // byte range to invalidate
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    // record the faked ino when faked inos are in use, else the real vino
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3727
3728 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3729 {
3730 if (unmounting)
3731 return;
3732 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3733 ino_invalidate_cb(callback_handle, ino, off, len);
3734 }
3735
3736 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3737
3738 if (ino_invalidate_cb)
3739 // we queue the invalidate, which calls the callback and decrements the ref
3740 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3741 }
3742
3743 void Client::_invalidate_inode_cache(Inode *in)
3744 {
3745 ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
3746
3747 // invalidate our userspace inode cache
3748 if (cct->_conf->client_oc) {
3749 objectcacher->release_set(&in->oset);
3750 if (!objectcacher->set_is_empty(&in->oset))
3751 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3752 }
3753
3754 _schedule_invalidate_callback(in, 0, 0);
3755 }
3756
3757 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3758 {
3759 ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
3760
3761 // invalidate our userspace inode cache
3762 if (cct->_conf->client_oc) {
3763 vector<ObjectExtent> ls;
3764 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3765 objectcacher->discard_set(&in->oset, ls);
3766 }
3767
3768 _schedule_invalidate_callback(in, off, len);
3769 }
3770
3771 bool Client::_release(Inode *in)
3772 {
3773 ldout(cct, 20) << "_release " << *in << dendl;
3774 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3775 _invalidate_inode_cache(in);
3776 return true;
3777 }
3778 return false;
3779 }
3780
3781 bool Client::_flush(Inode *in, Context *onfinish)
3782 {
3783 ldout(cct, 10) << "_flush " << *in << dendl;
3784
3785 if (!in->oset.dirty_or_tx) {
3786 ldout(cct, 10) << " nothing to flush" << dendl;
3787 onfinish->complete(0);
3788 return true;
3789 }
3790
3791 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3792 ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3793 objectcacher->purge_set(&in->oset);
3794 if (onfinish) {
3795 onfinish->complete(-ENOSPC);
3796 }
3797 return true;
3798 }
3799
3800 return objectcacher->flush_set(&in->oset, onfinish);
3801 }
3802
/*
 * Synchronously flush dirty buffered data in [offset, offset+size) to the
 * OSDs.  client_lock must be held on entry; it is dropped while waiting
 * for the flush to complete and re-taken before returning.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // private mutex/cond pair so we can sleep without holding client_lock
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3827
3828 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3829 {
3830 // Mutex::Locker l(client_lock);
3831 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3832 Inode *in = static_cast<Inode *>(oset->parent);
3833 assert(in);
3834 _flushed(in);
3835 }
3836
// All dirty data for the inode has been flushed; drop the CACHE+BUFFER
// cap refs held across the flush (presumably taken on the write path —
// confirm against callers of get_cap_ref).
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3843
3844
3845
3846 // checks common to add_update_cap, handle_cap_grant
3847 void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3848 {
3849 unsigned had = in->caps_issued();
3850
3851 if ((issued & CEPH_CAP_FILE_CACHE) &&
3852 !(had & CEPH_CAP_FILE_CACHE))
3853 in->cache_gen++;
3854
3855 if ((issued & CEPH_CAP_FILE_SHARED) &&
3856 !(had & CEPH_CAP_FILE_SHARED)) {
3857 in->shared_gen++;
3858
3859 if (in->is_dir())
3860 clear_dir_complete_and_ordered(in, true);
3861 }
3862 }
3863
/*
 * Install a new cap from @mds_session on @in, or update the existing one.
 *
 * @param in inode receiving the cap
 * @param mds_session session of the granting MDS
 * @param cap_id MDS-assigned cap id
 * @param issued cap bits granted
 * @param seq cap sequence number
 * @param mseq cap migration sequence number
 * @param realm snap realm ino for the inode (used for the first cap)
 * @param flags CEPH_CAP_FLAG_* (AUTH marks the auth MDS' cap)
 * @param cap_perms credentials under which the cap was obtained
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
                            int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // first cap from this MDS
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // first cap overall: attach the inode to its snap realm
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
    cap_list.push_back(&in->cap_item);
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // switch auth cap only if this cap has a newer migration seq
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << "add_update_cap changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued |= issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
           << " from mds." << mds
           << " on " << *in
           << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
        continue;
      if (it->second->implemented & ~it->second->issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // new bits were granted: wake anyone blocked in get_caps()
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
3949
/*
 * Remove a cap from its inode and session, freeing it.  If @queue_release
 * is set, a cap release is queued so the MDS learns we dropped it.
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    // losing the auth cap: flush bookkeeping tied to it goes too
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;

  if (!in->is_any_caps()) {
    // last cap gone: detach the inode from its snap realm
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
3988
3989 void Client::remove_all_caps(Inode *in)
3990 {
3991 while (!in->caps.empty())
3992 remove_cap(in->caps.begin()->second, true);
3993 }
3994
/*
 * Drop every cap held on session @s (session teardown).  Dirty/flushing
 * state that can no longer be flushed is discarded and waiters are woken
 * so they can re-evaluate (get_caps() sees I_CAP_DROPPED and re-requests).
 */
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      // true if either dirty or flushing caps exist (int bits -> bool)
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;
    }
    remove_cap(cap, false);
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // hold a ref while clearing so the inode cannot vanish mid-clear
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->dirty_caps = 0;
      // drop the ref taken by mark_caps_dirty()
      put_inode(in);
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4030
4031 int Client::_do_remount(void)
4032 {
4033 errno = 0;
4034 int r = remount_cb(callback_handle);
4035 if (r != 0) {
4036 int e = errno;
4037 client_t whoami = get_nodeid();
4038 if (r == -1) {
4039 lderr(cct) <<
4040 "failed to remount (to trim kernel dentries): "
4041 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4042 } else {
4043 lderr(cct) <<
4044 "failed to remount (to trim kernel dentries): "
4045 "return code = " << r << dendl;
4046 }
4047 bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_remount") ||
4048 cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
4049 if (should_abort && !unmounting) {
4050 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4051 ceph_abort();
4052 }
4053 }
4054 return r;
4055 }
4056
// Finisher context that triggers a remount (see _invalidate_kernel_dcache)
// from outside client_lock.
class C_Client_Remount : public Context  {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    assert(r == 0);  // queued with r == 0 only
    client->_do_remount();
  }
};
4067
4068 void Client::_invalidate_kernel_dcache()
4069 {
4070 if (unmounting)
4071 return;
4072 if (can_invalidate_dentries) {
4073 if (dentry_invalidate_cb && root->dir) {
4074 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4075 p != root->dir->dentries.end();
4076 ++p) {
4077 if (p->second->inode)
4078 _schedule_invalidate_dentry_callback(p->second, false);
4079 }
4080 }
4081 } else if (remount_cb) {
4082 // Hacky:
4083 // when remounting a file system, linux kernel trims all unused dentries in the fs
4084 remount_finisher.queue(new C_Client_Remount(this));
4085 }
4086 }
4087
/*
 * Trim the cap count on session @s down toward @max, as requested by the
 * MDS.  Unused non-auth caps are dropped directly; otherwise we try to
 * expire dentries so inodes (and their caps) can be released.  If we still
 * exceed @max afterwards, ask the kernel to drop its dentries too.
 */
void Client::trim_caps(MetaSession *s, int max)
{
  mds_rank_t mds = s->mds_num;
  int caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  int trimmed = 0;
  xlist<Cap*>::iterator p = s->caps.begin();
  std::set<InodeRef> anchor; /* prevent put_inode from deleting all caps during traversal */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        remove_cap(cap, true);
        /* N.B. no need to push onto anchor, as we are only removing one cap */
        trimmed++;
      }
    } else {
      // auth (or only) cap: try to expire the inode's dentries instead
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      while (q != in->dn_set.end()) {
        Dentry *dn = *q++;
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " anchoring inode: " << in->ino << dendl;
          anchor.insert(in);
          trim_dentry(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      // all dentries were expireable, so the inode's cap should go away
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  ldout(cct, 20) << " clearing anchored inodes" << dendl;
  anchor.clear();

  // still over the limit: ask the kernel to drop dentries too
  caps_size = s->caps.size();
  if (caps_size > max)
    _invalidate_kernel_dcache();
}
4151
4152 void Client::force_session_readonly(MetaSession *s)
4153 {
4154 s->readonly = true;
4155 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4156 Inode *in = (*p)->inode;
4157 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4158 signal_cond_list(in->waitfor_caps);
4159 }
4160 }
4161
// Mark cap bits dirty on the inode.  An inode ref is taken on the first
// clean->dirty transition; it is dropped once the dirty state is cleared
// (see e.g. remove_session_caps()).
void Client::mark_caps_dirty(Inode *in, int caps)
{
  ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> "
                 << ccap_string(in->dirty_caps | caps) << dendl;
  if (caps && !in->caps_dirty())
    in->get();
  in->dirty_caps |= caps;
}
4170
/**
 * Move the inode's dirty caps into the "flushing" state.
 *
 * Allocates a fresh flush tid, records the flushed bits under that tid
 * on both the inode and its auth session, merges the bits into
 * flushing_caps and clears dirty_caps.
 *
 * @param in   inode whose dirty caps are being flushed (must have an auth cap)
 * @param ptid out: the flush tid assigned to this batch
 * @return the cap bits now being flushed (always non-zero; asserted)
 */
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  // num_flushing_caps counts inodes with any flushing caps, so only
  // bump it on the first transition into the flushing state.
  if (!in->flushing_caps) {
    ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->dirty_caps = 0;

  // track the inode on the session's flushing list (once) and the tid
  // so wait_sync_caps() can find the oldest outstanding flush per session
  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4198
4199 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4200 {
4201 for (auto &p : in->cap_snaps) {
4202 CapSnap &capsnap = p.second;
4203 if (capsnap.flush_tid > 0) {
4204 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4205 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4206 }
4207 }
4208 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4209 it != in->flushing_cap_tids.end();
4210 ++it) {
4211 old_s->flushing_caps_tids.erase(it->first);
4212 new_s->flushing_caps_tids.insert(it->first);
4213 }
4214 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4215 }
4216
4217 /*
4218 * Flush all caps back to the MDS. Because the callers generally wait on the
4219 * result of this function (syncfs and umount cases), we set
4220 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4221 */
4222 void Client::flush_caps_sync()
4223 {
4224 ldout(cct, 10) << __func__ << dendl;
4225 xlist<Inode*>::iterator p = delayed_caps.begin();
4226 while (!p.end()) {
4227 unsigned flags = CHECK_CAPS_NODELAY;
4228 Inode *in = *p;
4229
4230 ++p;
4231 delayed_caps.pop_front();
4232 if (p.end() && cap_list.empty())
4233 flags |= CHECK_CAPS_SYNCHRONOUS;
4234 check_caps(in, flags);
4235 }
4236
4237 // other caps, too
4238 p = cap_list.begin();
4239 while (!p.end()) {
4240 unsigned flags = CHECK_CAPS_NODELAY;
4241 Inode *in = *p;
4242
4243 ++p;
4244 if (p.end())
4245 flags |= CHECK_CAPS_SYNCHRONOUS;
4246 check_caps(in, flags);
4247 }
4248 }
4249
/**
 * (Re)send a cap flush message to the auth MDS for every outstanding
 * flush tid on this inode (e.g. after session reconnect or cap import).
 *
 * @param in      inode with pending flushes; its auth cap must belong
 *                to @a session (asserted)
 * @param session the auth MDS session to send to
 * @param sync    if true, request a journal flush on the final tid so
 *                the MDS acks promptly
 */
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  assert(cap->session == session);

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
	     (get_caps_used(in) | in->caps_dirty()),
	     in->caps_wanted(), (cap->issued | cap->implemented),
	     p->second, p->first);
  }
}
4271
/**
 * Block until every cap flush on @a in with tid <= @a want has been
 * acked by the MDS.
 *
 * Must be called with client_lock held; wait_on_list() releases and
 * reacquires it while sleeping.  handle_cap_flush_ack() signals
 * in->waitfor_caps as tids are retired.
 */
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    // tids are flushed in order, so only the oldest matters
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;
    ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4285
4286 void Client::wait_sync_caps(ceph_tid_t want)
4287 {
4288 retry:
4289 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4290 << num_flushing_caps << " total flushing)" << dendl;
4291 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4292 p != mds_sessions.end();
4293 ++p) {
4294 MetaSession *s = p->second;
4295 if (s->flushing_caps_tids.empty())
4296 continue;
4297 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4298 if (oldest_tid <= want) {
4299 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4300 << " (want " << want << ")" << dendl;
4301 sync_cond.Wait(client_lock);
4302 goto retry;
4303 }
4304 }
4305 }
4306
/**
 * After an MDS session is re-established, re-send all pending cap
 * flushes (and snap flushes) for inodes on that session, skipping
 * anything already re-sent by early_kick_flushing_caps() during the
 * reconnect phase.
 */
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (session->early_flushing_caps.count(in))
      continue;  // already re-flushed in the reconnect stage
    ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }

  // the early-kick set has served its purpose for this reconnect
  session->early_flushing_caps.clear();
}
4325
/**
 * During the client-reconnect stage, re-send cap flushes whose flushed
 * bits were (partially) revoked, so the MDS processes them before it
 * can issue those caps to another client.  Inodes handled here are
 * remembered in early_flushing_caps so kick_flushing_caps() skips them.
 */
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    assert(in->auth_cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;  // nothing revoked; the normal kick will handle it

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4352
4353 void Client::kick_maxsize_requests(MetaSession *session)
4354 {
4355 xlist<Cap*>::iterator iter = session->caps.begin();
4356 while (!iter.end()){
4357 (*iter)->inode->requested_max_size = 0;
4358 (*iter)->inode->wanted_max_size = 0;
4359 signal_cond_list((*iter)->inode->waitfor_caps);
4360 ++iter;
4361 }
4362 }
4363
4364 void SnapRealm::build_snap_context()
4365 {
4366 set<snapid_t> snaps;
4367 snapid_t max_seq = seq;
4368
4369 // start with prior_parents?
4370 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4371 snaps.insert(prior_parent_snaps[i]);
4372
4373 // current parent's snaps
4374 if (pparent) {
4375 const SnapContext& psnapc = pparent->get_snap_context();
4376 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4377 if (psnapc.snaps[i] >= parent_since)
4378 snaps.insert(psnapc.snaps[i]);
4379 if (psnapc.seq > max_seq)
4380 max_seq = psnapc.seq;
4381 }
4382
4383 // my snaps
4384 for (unsigned i=0; i<my_snaps.size(); i++)
4385 snaps.insert(my_snaps[i]);
4386
4387 // ok!
4388 cached_snap_context.seq = max_seq;
4389 cached_snap_context.snaps.resize(0);
4390 cached_snap_context.snaps.reserve(snaps.size());
4391 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4392 cached_snap_context.snaps.push_back(*p);
4393 }
4394
4395 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4396 {
4397 list<SnapRealm*> q;
4398 q.push_back(realm);
4399
4400 while (!q.empty()) {
4401 realm = q.front();
4402 q.pop_front();
4403
4404 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4405 realm->invalidate_cache();
4406
4407 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4408 p != realm->pchildren.end();
4409 ++p)
4410 q.push_back(*p);
4411 }
4412 }
4413
4414 SnapRealm *Client::get_snap_realm(inodeno_t r)
4415 {
4416 SnapRealm *realm = snap_realms[r];
4417 if (!realm)
4418 snap_realms[r] = realm = new SnapRealm(r);
4419 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4420 realm->nref++;
4421 return realm;
4422 }
4423
4424 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4425 {
4426 if (snap_realms.count(r) == 0) {
4427 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4428 return NULL;
4429 }
4430 SnapRealm *realm = snap_realms[r];
4431 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4432 realm->nref++;
4433 return realm;
4434 }
4435
4436 void Client::put_snap_realm(SnapRealm *realm)
4437 {
4438 ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
4439 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4440 if (--realm->nref == 0) {
4441 snap_realms.erase(realm->ino);
4442 if (realm->pparent) {
4443 realm->pparent->pchildren.erase(realm);
4444 put_snap_realm(realm->pparent);
4445 }
4446 delete realm;
4447 }
4448 }
4449
4450 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4451 {
4452 if (realm->parent != parent) {
4453 ldout(cct, 10) << "adjust_realm_parent " << *realm
4454 << " " << realm->parent << " -> " << parent << dendl;
4455 realm->parent = parent;
4456 if (realm->pparent) {
4457 realm->pparent->pchildren.erase(realm);
4458 put_snap_realm(realm->pparent);
4459 }
4460 realm->pparent = get_snap_realm(parent);
4461 realm->pparent->pchildren.insert(realm);
4462 return true;
4463 }
4464 return false;
4465 }
4466
4467 static bool has_new_snaps(const SnapContext& old_snapc,
4468 const SnapContext& new_snapc)
4469 {
4470 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4471 }
4472
4473
4474 void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
4475 {
4476 SnapRealm *first_realm = NULL;
4477 ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
4478
4479 map<SnapRealm*, SnapContext> dirty_realms;
4480
4481 bufferlist::iterator p = bl.begin();
4482 while (!p.end()) {
4483 SnapRealmInfo info;
4484 ::decode(info, p);
4485 SnapRealm *realm = get_snap_realm(info.ino());
4486
4487 bool invalidate = false;
4488
4489 if (info.seq() > realm->seq) {
4490 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
4491 << dendl;
4492
4493 if (flush) {
4494 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4495 // flush me + children
4496 list<SnapRealm*> q;
4497 q.push_back(realm);
4498 while (!q.empty()) {
4499 SnapRealm *realm = q.front();
4500 q.pop_front();
4501
4502 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4503 p != realm->pchildren.end();
4504 ++p)
4505 q.push_back(*p);
4506
4507 if (dirty_realms.count(realm) == 0) {
4508 realm->nref++;
4509 dirty_realms[realm] = realm->get_snap_context();
4510 }
4511 }
4512 }
4513
4514 // update
4515 realm->seq = info.seq();
4516 realm->created = info.created();
4517 realm->parent_since = info.parent_since();
4518 realm->prior_parent_snaps = info.prior_parent_snaps;
4519 realm->my_snaps = info.my_snaps;
4520 invalidate = true;
4521 }
4522
4523 // _always_ verify parent
4524 if (adjust_realm_parent(realm, info.parent()))
4525 invalidate = true;
4526
4527 if (invalidate) {
4528 invalidate_snaprealm_and_children(realm);
4529 ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
4530 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4531 } else {
4532 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
4533 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4534 }
4535
4536 if (!first_realm)
4537 first_realm = realm;
4538 else
4539 put_snap_realm(realm);
4540 }
4541
4542 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4543 q != dirty_realms.end();
4544 ++q) {
4545 SnapRealm *realm = q->first;
4546 // if there are new snaps ?
4547 if (has_new_snaps(q->second, realm->get_snap_context())) {
4548 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4549 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4550 while (!r.end()) {
4551 Inode *in = *r;
4552 ++r;
4553 queue_cap_snap(in, q->second);
4554 }
4555 } else {
4556 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4557 }
4558 put_snap_realm(realm);
4559 }
4560
4561 if (realm_ret)
4562 *realm_ret = first_realm;
4563 else
4564 put_snap_realm(first_realm);
4565 }
4566
/**
 * Handle an MClientSnap message from an MDS.
 *
 * For CEPH_SNAP_OP_SPLIT, inodes listed in the message are moved out of
 * their old realm into the newly split-off realm (unless their realm is
 * newer than the split), and listed child realms are re-parented.  The
 * embedded snap trace is then applied; moved inodes whose snap context
 * gained snaps get cap snaps queued.  Consumes the message.
 */
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  // inodes to re-home into the split realm, with their old snap contexts
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
	 p != m->split_inos.end();
	 ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// don't move inodes whose realm postdates the split we're applying
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	// detach from the old realm; remember the old snap context so we
	// can decide below whether new snaps appeared
	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
	 p != m->split_realms.end();
	 ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the snap trace; don't flush caps for a destroyed snap
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // attach the moved inodes to the split realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
4644
4645 void Client::handle_quota(MClientQuota *m)
4646 {
4647 mds_rank_t mds = mds_rank_t(m->get_source().num());
4648 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4649 if (!session) {
4650 m->put();
4651 return;
4652 }
4653
4654 got_mds_push(session);
4655
4656 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4657
4658 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4659 if (inode_map.count(vino)) {
4660 Inode *in = NULL;
4661 in = inode_map[vino];
4662
4663 if (in) {
4664 in->quota = m->quota;
4665 in->rstat = m->rstat;
4666 }
4667 }
4668
4669 m->put();
4670 }
4671
/**
 * Top-level dispatcher for MClientCaps messages from an MDS.
 *
 * Applies any OSD epoch barrier carried by the message, locates the
 * target inode, and routes to the per-op handler.  EXPORT and
 * FLUSHSNAP_ACK are handled (and the message consumed) before the cap
 * lookup; IMPORT is applied first and then falls through to the grant
 * path.  Each handler (or the default branch) consumes the message.
 */
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload();  // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  // ops that don't require (or that install) a local cap for this mds
  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    handle_cap_import(session, in, m);  // falls through to grant below
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  // ops that operate on an existing cap
  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();
  }
}
4745
/**
 * Handle CEPH_CAP_OP_IMPORT: the MDS is giving us a (migrated) cap.
 *
 * Installs/updates the cap as the auth cap, removes the old peer cap it
 * replaces (if any), applies the embedded snap trace, and re-sends any
 * pending snap/cap flushes now that this MDS is auth.  Does NOT consume
 * the message; handle_caps() falls through to handle_cap_grant().
 */
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  // remember the peer cap this import replaces (and its perms), so we
  // can remove it after the new cap is installed
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap) {
      cap_perms = cap->latest_perms;
    }
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
		 CEPH_CAP_FLAG_AUTH, cap_perms);

  // only drop the old cap if it is the one the import names
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4786
/**
 * Handle CEPH_CAP_OP_EXPORT: the MDS is migrating our cap away.
 *
 * If a peer cap is named, merge this cap's issued bits into the peer
 * cap (or create one) on the target MDS, migrating auth status and any
 * in-flight flushes.  If no peer is named, the cap is simply dropped
 * (I_CAP_DROPPED is flagged if it was the auth cap).  Consumes the
 * message.
 */
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  // only act if the export names the cap we actually hold
  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
	Cap *tcap = in->caps[peer_mds];
	// merge into the existing target cap only if it matches the
	// peer id and is older than the migration sequence
	if (tcap->cap_id == m->peer.cap_id &&
	    ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
	  tcap->cap_id = m->peer.cap_id;
	  tcap->seq = m->peer.seq - 1;
	  tcap->issue_seq = tcap->seq;
	  tcap->mseq = m->peer.mseq;
	  tcap->issued |= cap->issued;
	  tcap->implemented |= cap->issued;
	  if (cap == in->auth_cap)
	    in->auth_cap = tcap;
	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
	    adjust_session_flushing_caps(in, session, tsession);
	}
      } else {
	// no cap on the target MDS yet; create one carrying our issued bits
	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
		       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
		       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
		       cap->latest_perms);
      }
    } else {
      // no peer: the cap is simply going away
      if (cap == in->auth_cap)
	in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
4834
/**
 * Handle CEPH_CAP_OP_TRUNC: the MDS truncated the file; fold the new
 * size/truncate state and timestamps into the inode, honoring which
 * fields our issued caps let us keep locally.  Consumes the message.
 */
void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);

  ldout(cct, 10) << "handle_cap_trunc on ino " << *in
		 << " size " << in->size << " -> " << m->get_size()
		 << dendl;

  // include implemented and dirty bits so locally-held state wins
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(),
                         m->get_size(), m->get_change_attr(), m->get_time_warp_seq(),
                         m->get_ctime(), m->get_mtime(), m->get_atime(),
                         m->inline_version, m->inline_data, issued);
  m->put();
}
4853
/**
 * Handle CEPH_CAP_OP_FLUSH_ACK: the MDS acknowledged a cap flush.
 *
 * Retires every flush tid <= the acked tid (the MDS acks in order),
 * computes which cap bits are fully clean (bits still pending under a
 * later tid are masked back out), clears them from flushing_caps, and
 * wakes waiters.  Drops the inode ref taken by mark_caps_dirty() once
 * nothing is flushing or dirty.  Consumes the message.
 */
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;   // bits covered by the acked flush
    if (it->first <= flush_ack_tid) {
      // earlier (and the acked) tids are done; retire them
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // a later tid still flushes these bits, so they are not clean yet
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
		<< " cleaned " << ccap_string(cleaned) << " on " << *in
		<< " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake wait_sync_caps() if this session's oldest pending tid advanced
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // drop the ref mark_caps_dirty() took, unless new dirt appeared
      if (!in->caps_dirty())
	put_inode(in);
    }
  }

  m->put();
}
4913
4914
/**
 * Handle CEPH_CAP_OP_FLUSHSNAP_ACK: the MDS acknowledged a snap flush.
 *
 * Erases the matching CapSnap (identified by snap_follows and the flush
 * tid) and its session tid bookkeeping; mismatched tids and unknown
 * follows values (possible duplicate acks) are logged and ignored.
 * Consumes the message.
 */
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);
  snapid_t follows = m->get_snap_follows();

  if (in->cap_snaps.count(follows)) {
    CapSnap &capsnap = in->cap_snaps.at(follows);
    if (m->get_client_tid() != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
	      << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
	tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      // unlink from the session flushing list once nothing is pending
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
	in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(follows);
    }
  } else {
    ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
	    << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }

  m->put();
}
4944
/**
 * Completion that invokes the registered dentry-invalidate callback on
 * the async invalidator thread (outside client_lock).
 *
 * Captures the parent-directory ino, the target ino (only when @a del
 * is set), and the dentry name at construction time, using faked inos
 * when the client exposes them.
 */
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory of the dentry
  vinodeno_t ino;     // target inode; zero ino when not deleting
  string name;        // dentry name
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();  // invalidate-only: no target inode
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
4972
/**
 * Invoke the registered dentry-invalidate callback for (dirino, name),
 * ino.  Runs on the async invalidator thread; skipped while unmounting.
 */
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}
4981
4982 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
4983 {
4984 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
4985 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
4986 }
4987
/**
 * Try to release references pinning @a in so it can be trimmed from
 * cache: drop expireable child dentries (recursing into snapshot
 * subtrees), close an empty dir, trim an open snapdir, and — when
 * @a sched_inval is set — schedule kernel dcache invalidation for any
 * remaining dentries held by the upper layer.
 */
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() can erase the entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // close_dir dropped the dir's ref on the inode
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      // so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
5030
/**
 * Handle GRANT/REVOKE/IMPORT cap updates from an MDS.
 *
 * Updates inode metadata from the message (only for fields whose
 * exclusive caps we do not hold), adjusts max_size, then applies the
 * cap change: on revocation, flush buffers / release cache / ack as
 * appropriate; on grant, check whether a non-auth MDS is revoking the
 * newly granted bits.  Wakes cap waiters and, if the link count hit
 * zero, tries to trim the inode.  Consumes the message.
 */
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const int old_caps = cap->issued;
  const int new_caps = m->get_caps();
  ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(old_caps) << dendl;
  cap->seq = m->get_seq();

  in->layout = m->get_layout();

  // update inode
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;

  // only accept fields the MDS is authoritative for (we don't hold EXCL)
  if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    bufferlist::iterator p = m->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
			 m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
			 m->get_mtime(), m->get_atime(),
			 m->inline_version, m->inline_data, issued);

  // max_size
  if (cap == in->auth_cap &&
      m->get_max_size() != in->max_size) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
    check = true;  // tell the MDS what we actually want

  check_cap_issue(in, cap, new_caps);

  // update caps
  int revoked = old_caps & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << "  revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
	&& !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }

  } else if (old_caps == new_caps) {
    ldout(cct, 10) << "  caps unchanged at " << ccap_string(old_caps) << dendl;
  } else {
    ldout(cct, 10) << "  grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
	if (it->second == cap)
	  continue;
	if (it->second->implemented & ~it->second->issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);

  m->put();
}
5151
/**
 * Fetch the supplementary group list for a user.
 *
 * Tries the registered getgroups callback first; otherwise (when
 * HAVE_GETGROUPLIST) resolves the user via getpwuid() and calls
 * getgrouplist(), growing the buffer until it fits.  On success the
 * caller owns the malloc'd *sgids buffer and must free() it.
 *
 * @return number of groups (>= 0), or -errno / -ENOMEM on failure;
 *         0 when no mechanism is available
 */
int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid)
{
  // cppcheck-suppress variableScope
  int sgid_count;
  gid_t *sgid_buf;

  if (getgroups_cb) {
    sgid_count = getgroups_cb(callback_handle, &sgid_buf);
    if (sgid_count > 0) {
      *sgids = sgid_buf;
      return sgid_count;
    }
    // non-positive count: fall through to the getgrouplist() path
  }

#if HAVE_GETGROUPLIST
  struct passwd *pw;
  pw = getpwuid(uid);
  if (pw == NULL) {
    ldout(cct, 3) << "getting user entry failed" << dendl;
    return -errno;
  }
  //use PAM to get the group list
  // initial number of group entries, defaults to posix standard of 16
  // PAM implementations may provide more than 16 groups....
  sgid_count = 16;
  sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t));
  if (sgid_buf == NULL) {
    ldout(cct, 3) << "allocating group memory failed" << dendl;
    return -ENOMEM;
  }

  while (1) {
#if defined(__APPLE__)
    if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) {
#else
    if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) {
#endif
      // we need to resize the group list and try again
      // (getgrouplist updated sgid_count to the required size)
      void *_realloc = NULL;
      if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) {
	ldout(cct, 3) << "allocating group memory failed" << dendl;
	free(sgid_buf);
	return -ENOMEM;
      }
      sgid_buf = (gid_t*)_realloc;
      continue;
    }
    // list was successfully retrieved
    break;
  }
  *sgids = sgid_buf;
  return sgid_count;
#else
  return 0;
#endif
}
5208
5209 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5210 {
5211 if (perms.uid() == 0)
5212 return 0;
5213
5214 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5215 int ret = _posix_acl_permission(in, perms, want);
5216 if (ret != -EAGAIN)
5217 return ret;
5218 }
5219
5220 // check permissions before doing anything else
5221 if (!in->check_mode(perms, want))
5222 return -EACCES;
5223 return 0;
5224 }
5225
5226 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5227 const UserPerm& perms)
5228 {
5229 int r = _getattr_for_perm(in, perms);
5230 if (r < 0)
5231 goto out;
5232
5233 r = 0;
5234 if (strncmp(name, "system.", 7) == 0) {
5235 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5236 r = -EPERM;
5237 } else {
5238 r = inode_permission(in, perms, want);
5239 }
5240 out:
5241 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5242 return r;
5243 }
5244
5245 ostream& operator<<(ostream &out, const UserPerm& perm) {
5246 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5247 return out;
5248 }
5249
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  // Validate that `perms` may apply the setattr described by stx/mask,
  // following POSIX chown/chmod/utimes ownership rules.  `r` stays
  // -EPERM through the ownership checks; each failing check jumps to
  // `out` with that value.
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncating requires write permission on the file
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; a non-root owner may only "chown"
    // to the current uid (a no-op)
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // the owner may change group only to a group they belong to or to the
    // file's current group; root may do anything
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
		  (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-root callers not in the file's (resulting) group have setgid
    // stripped, mirroring kernel chmod behaviour
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owners may set times to "now" given write permission, but may
      // never set explicit timestamps
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5306
5307 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5308 {
5309 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5310 unsigned want = 0;
5311
5312 if ((flags & O_ACCMODE) == O_WRONLY)
5313 want = MAY_WRITE;
5314 else if ((flags & O_ACCMODE) == O_RDWR)
5315 want = MAY_READ | MAY_WRITE;
5316 else if ((flags & O_ACCMODE) == O_RDONLY)
5317 want = MAY_READ;
5318 if (flags & O_TRUNC)
5319 want |= MAY_WRITE;
5320
5321 int r = 0;
5322 switch (in->mode & S_IFMT) {
5323 case S_IFLNK:
5324 r = -ELOOP;
5325 goto out;
5326 case S_IFDIR:
5327 if (want & MAY_WRITE) {
5328 r = -EISDIR;
5329 goto out;
5330 }
5331 break;
5332 }
5333
5334 r = _getattr_for_perm(in, perms);
5335 if (r < 0)
5336 goto out;
5337
5338 r = inode_permission(in, perms, want);
5339 out:
5340 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5341 return r;
5342 }
5343
5344 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5345 {
5346 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5347 int r = _getattr_for_perm(dir, perms);
5348 if (r < 0)
5349 goto out;
5350
5351 r = inode_permission(dir, perms, MAY_EXEC);
5352 out:
5353 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5354 return r;
5355 }
5356
5357 int Client::may_create(Inode *dir, const UserPerm& perms)
5358 {
5359 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5360 int r = _getattr_for_perm(dir, perms);
5361 if (r < 0)
5362 goto out;
5363
5364 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5365 out:
5366 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5367 return r;
5368 }
5369
5370 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5371 {
5372 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5373 int r = _getattr_for_perm(dir, perms);
5374 if (r < 0)
5375 goto out;
5376
5377 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5378 if (r < 0)
5379 goto out;
5380
5381 /* 'name == NULL' means rmsnap */
5382 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5383 InodeRef otherin;
5384 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5385 if (r < 0)
5386 goto out;
5387 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5388 r = -EPERM;
5389 }
5390 out:
5391 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5392 return r;
5393 }
5394
5395 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5396 {
5397 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5398 int r = _getattr_for_perm(in, perms);
5399 if (r < 0)
5400 goto out;
5401
5402 if (perms.uid() == 0 || perms.uid() == in->uid) {
5403 r = 0;
5404 goto out;
5405 }
5406
5407 r = -EPERM;
5408 if (!S_ISREG(in->mode))
5409 goto out;
5410
5411 if (in->mode & S_ISUID)
5412 goto out;
5413
5414 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5415 goto out;
5416
5417 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5418 out:
5419 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5420 return r;
5421 }
5422
5423 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5424 {
5425 int mask = CEPH_STAT_CAP_MODE;
5426 bool force = false;
5427 if (acl_type != NO_ACL) {
5428 mask |= CEPH_STAT_CAP_XATTR;
5429 force = in->xattr_version == 0;
5430 }
5431 return _getattr(in, mask, perms, force);
5432 }
5433
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  // (ino, snapid) pair uniquely identifying this version of the inode
  return vinodeno_t(in->ino, in->snapid);
}
5439
inodeno_t Client::_get_inodeno(Inode *in)
{
  /* The caller must hold the client lock */
  // bare inode number, without the snapshot id
  return in->ino;
}
5445
5446
5447 /**
5448 * Resolve an MDS spec to a list of MDS daemon GIDs.
5449 *
5450 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5451 * It may be '*' in which case it matches all GIDs.
5452 *
5453 * If no error is returned, the `targets` vector will be populated with at least
5454 * one MDS.
5455 */
5456 int Client::resolve_mds(
5457 const std::string &mds_spec,
5458 std::vector<mds_gid_t> *targets)
5459 {
5460 assert(fsmap);
5461 assert(targets != nullptr);
5462
5463 mds_role_t role;
5464 std::stringstream ss;
5465 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5466 if (role_r == 0) {
5467 // We got a role, resolve it to a GID
5468 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5469 << role << "'" << dendl;
5470 targets->push_back(
5471 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5472 return 0;
5473 }
5474
5475 std::string strtol_err;
5476 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5477 if (strtol_err.empty()) {
5478 // It is a possible GID
5479 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5480 if (fsmap->gid_exists(mds_gid)) {
5481 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5482 targets->push_back(mds_gid);
5483 } else {
5484 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5485 << dendl;
5486 return -ENOENT;
5487 }
5488 } else if (mds_spec == "*") {
5489 // It is a wildcard: use all MDSs
5490 const auto mds_info = fsmap->get_mds_info();
5491
5492 if (mds_info.empty()) {
5493 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5494 return -ENOENT;
5495 }
5496
5497 for (const auto i : mds_info) {
5498 targets->push_back(i.first);
5499 }
5500 } else {
5501 // It did not parse as an integer, it is not a wildcard, it must be a name
5502 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5503 if (mds_gid == 0) {
5504 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5505
5506 lderr(cct) << "FSMap: " << *fsmap << dendl;
5507
5508 return -ENOENT;
5509 } else {
5510 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5511 << "' to GID " << mds_gid << dendl;
5512 targets->push_back(mds_gid);
5513 }
5514 }
5515
5516 return 0;
5517 }
5518
5519
5520 /**
5521 * Authenticate with mon and establish global ID
5522 */
int Client::authenticate()
{
  // Authenticate with the monitors and adopt the mon-assigned global id
  // as our client entity name.  Caller must hold client_lock.
  assert(client_lock.is_locked_by_me());

  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks on the mon round-trip; drop
  // client_lock around it so other client threads are not stalled.
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5543
int Client::fetch_fsmap(bool user)
{
  // Fetch (or refresh) our copy of the FSMap (`user` selects the
  // trimmed-down "fsmap.user" variant) and wait until we hold an epoch
  // at least as new as the monitor's latest.
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop client_lock while blocking on the mon round-trip
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) if our fsmap.user is missing or stale, and
    // block until the subscription delivers it
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full fsmap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5587
5588 /**
5589 *
5590 * @mds_spec one of ID, rank, GID, "*"
5591 *
5592 */
5593 int Client::mds_command(
5594 const std::string &mds_spec,
5595 const vector<string>& cmd,
5596 const bufferlist& inbl,
5597 bufferlist *outbl,
5598 string *outs,
5599 Context *onfinish)
5600 {
5601 Mutex::Locker lock(client_lock);
5602
5603 if (!initialized)
5604 return -ENOTCONN;
5605
5606 int r;
5607 r = authenticate();
5608 if (r < 0) {
5609 return r;
5610 }
5611
5612 r = fetch_fsmap(false);
5613 if (r < 0) {
5614 return r;
5615 }
5616
5617 // Look up MDS target(s) of the command
5618 std::vector<mds_gid_t> targets;
5619 r = resolve_mds(mds_spec, &targets);
5620 if (r < 0) {
5621 return r;
5622 }
5623
5624 // If daemons are laggy, we won't send them commands. If all
5625 // are laggy then we fail.
5626 std::vector<mds_gid_t> non_laggy;
5627 for (const auto gid : targets) {
5628 const auto info = fsmap->get_info_gid(gid);
5629 if (!info.laggy()) {
5630 non_laggy.push_back(gid);
5631 }
5632 }
5633 if (non_laggy.size() == 0) {
5634 *outs = "All targeted MDS daemons are laggy";
5635 return -ENOENT;
5636 }
5637
5638 if (metadata.empty()) {
5639 // We are called on an unmounted client, so metadata
5640 // won't be initialized yet.
5641 populate_metadata("");
5642 }
5643
5644 // Send commands to targets
5645 C_GatherBuilder gather(cct, onfinish);
5646 for (const auto target_gid : non_laggy) {
5647 const auto info = fsmap->get_info_gid(target_gid);
5648
5649 // Open a connection to the target MDS
5650 entity_inst_t inst = info.get_inst();
5651 ConnectionRef conn = messenger->get_connection(inst);
5652
5653 // Generate MDSCommandOp state
5654 auto &op = command_table.start_command();
5655
5656 op.on_finish = gather.new_sub();
5657 op.cmd = cmd;
5658 op.outbl = outbl;
5659 op.outs = outs;
5660 op.inbl = inbl;
5661 op.mds_gid = target_gid;
5662 op.con = conn;
5663
5664 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5665 << " tid=" << op.tid << cmd << dendl;
5666
5667 // Construct and send MCommand
5668 MCommand *m = op.get_message(monclient->get_fsid());
5669 conn->send_message(m);
5670 }
5671 gather.activate();
5672
5673 return 0;
5674 }
5675
5676 void Client::handle_command_reply(MCommandReply *m)
5677 {
5678 ceph_tid_t const tid = m->get_tid();
5679
5680 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5681
5682 if (!command_table.exists(tid)) {
5683 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5684 m->put();
5685 return;
5686 }
5687
5688 auto &op = command_table.get_command(tid);
5689 if (op.outbl) {
5690 op.outbl->claim(m->get_data());
5691 }
5692 if (op.outs) {
5693 *op.outs = m->rs;
5694 }
5695
5696 if (op.on_finish) {
5697 op.on_finish->complete(m->r);
5698 }
5699
5700 command_table.erase(tid);
5701
5702 m->put();
5703 }
5704
5705 // -------------------
5706 // MOUNT
5707
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  // Mount the filesystem at mount_root (default "/"): authenticate,
  // subscribe to the mdsmap, optionally wait for an available MDS
  // cluster, then walk up from the mount point issuing GETATTRs so we
  // hold the root and its ancestors.
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    // a specific filesystem was requested: resolve its cluster id and
    // subscribe to that filesystem's mdsmap ("mdsmap.<cid>")
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    // block until the MDS cluster is usable (or provably stuck)
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // GETATTR the mount point, then each ancestor up to the real root, so
  // quota/realm information above the mount point is available.  EACCES
  // on an ancestor is tolerated once the mount point itself resolved.
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  _ll_get(root);

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5820
5821 // UNMOUNT
5822
5823 void Client::_close_sessions()
5824 {
5825 while (!mds_sessions.empty()) {
5826 // send session closes!
5827 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5828 p != mds_sessions.end();
5829 ++p) {
5830 if (p->second->state != MetaSession::STATE_CLOSING) {
5831 _close_mds_session(p->second);
5832 }
5833 }
5834
5835 // wait for sessions to close
5836 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5837 mount_cond.Wait(client_lock);
5838 }
5839 }
5840
5841 void Client::flush_mdlog_sync()
5842 {
5843 if (mds_requests.empty())
5844 return;
5845 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5846 p != mds_sessions.end();
5847 ++p) {
5848 MetaSession *s = p->second;
5849 flush_mdlog(s);
5850 }
5851 }
5852
5853 void Client::flush_mdlog(MetaSession *session)
5854 {
5855 // Only send this to Luminous or newer MDS daemons, older daemons
5856 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5857 const uint64_t features = session->con->get_features();
5858 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5859 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5860 session->con->send_message(m);
5861 }
5862 }
5863
5864
void Client::_unmount()
{
  // Tear down the mount: drain requests, close files/dirs, flush dirty
  // data and caps, empty the cache, then close MDS sessions.  Caller
  // must hold client_lock.
  if (unmounting)
    return;

  ldout(cct, 2) << "unmounting" << dendl;
  unmounting = true;

  deleg_timeout = 0;

  flush_mdlog_sync(); // flush the mdlog for pending requests, if any
  // wait until all in-flight MDS requests have completed
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // stop the periodic tick
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // same for handles opened through the low-level (ll_) interface
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // and any directories still open
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (blacklisted) {
    // a blacklisted client cannot write back anything; drop caches and
    // bail out without the clean flush/close protocol below
    ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;

    if (cct->_conf->client_oc) {
      // Purge all cached data so that ObjectCacher doesn't get hung up
      // trying to flush it.  ObjectCacher's behaviour on EBLACKLISTED
      // is to just leave things marked dirty
      // (http://tracker.ceph.com/issues/9105)
      for (const auto &i : inode_map) {
	objectcacher->purge_set(&(i.second->oset));
      }
    }

    mounted = false;
    return;
  }

  // wait for unsafe (not-yet-committed) sync writes to land
  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    // (iterate with a saved `next` because _release/_flush callbacks may
    // erase entries from inode_map)
    ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
    for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
	 p != inode_map.end();
	 p = next) {
      next = p;
      ++next;
      Inode *in = p->second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
	assert(in);
      }
      if (!in->caps.empty()) {
	// hold a temporary ref so the inode survives the flush
	InodeRef tmp_ref(in);
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  flush_caps_sync();
  wait_sync_caps(last_flush_tid);

  // empty lru cache
  trim_cache();

  // wait for the cache to fully drain (caps must be returned before the
  // MDS will let the inodes go); dump the cache if we stall for 5s
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
	    << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
	    << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  assert(lru.lru_get_size() == 0);
  assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
5987
void Client::unmount()
{
  // Public entry point: take the client lock and run the real teardown.
  Mutex::Locker lock(client_lock);
  _unmount();
}
5993
5994 void Client::flush_cap_releases()
5995 {
5996 // send any cap releases
5997 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5998 p != mds_sessions.end();
5999 ++p) {
6000 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
6001 p->first)) {
6002 if (cct->_conf->client_inject_release_failure) {
6003 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6004 p->second->release->put();
6005 } else {
6006 p->second->con->send_message(p->second->release);
6007 }
6008 p->second->release = 0;
6009 }
6010 }
6011 }
6012
void Client::tick()
{
  // Periodic housekeeping.  Re-arms itself via the timer, times out
  // mount-phase requests, renews caps, flushes cap releases, processes
  // delayed caps and trims the cache.
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    // test hook: stall this tick once, then reset the knob
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	assert(client_lock.is_locked_by_me());
	tick();
      }));
  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    // while mounting, abort the oldest request once it exceeds the
    // mount timeout, and wake anyone blocked on the mdsmap / session open
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps
  xlist<Inode*>::iterator p = delayed_caps.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;  // advance before pop_front() removes the current entry
    if (in->hold_caps_until > now)
      break;  // stop at the first entry that is not yet due
    delayed_caps.pop_front();
    cap_list.push_back(&in->cap_item);
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6070
6071 void Client::renew_caps()
6072 {
6073 ldout(cct, 10) << "renew_caps()" << dendl;
6074 last_cap_renew = ceph_clock_now();
6075
6076 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6077 p != mds_sessions.end();
6078 ++p) {
6079 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6080 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6081 renew_caps(p->second);
6082 }
6083 }
6084
6085 void Client::renew_caps(MetaSession *session)
6086 {
6087 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6088 session->last_cap_renew_request = ceph_clock_now();
6089 uint64_t seq = ++session->cap_renew_seq;
6090 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6091 }
6092
6093
6094 // ===============================================================
6095 // high level (POSIXy) interface
6096
6097 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6098 InodeRef *target, const UserPerm& perms)
6099 {
6100 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6101 MetaRequest *req = new MetaRequest(op);
6102 filepath path;
6103 dir->make_nosnap_relative_path(path);
6104 path.push_dentry(name);
6105 req->set_filepath(path);
6106 req->set_inode(dir);
6107 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6108 mask |= DEBUG_GETATTR_CAPS;
6109 req->head.args.getattr.mask = mask;
6110
6111 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6112
6113 int r = make_request(req, perms, target);
6114 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6115 return r;
6116 }
6117
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  // Resolve `dname` under `dir`, preferring the local dcache when our
  // dentry lease or the directory's shared cap is still valid; otherwise
  // fall through to an MDS lookup via _do_lookup().
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname == "..") {
    if (dir->dn_set.empty())
      // no parent linkage known; ".." of an unlinked root-ish dir is itself
      *target = dir;
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    // virtual ".snap" directory
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	     << " seq " << dn->lease_seq
	     << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	// the lease is only trustworthy while the issuing session's caps
	// are live and its generation matches the lease's
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	// a null dentry in a complete directory proves the name is absent
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss or stale: ask the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6219
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // Return (in *pdn) the dentry for `name` under `dir`, creating a null
  // dentry if none exists.  With expect_null, fail with -EEXIST when a
  // lease-valid dentry with an inode is already present.
  // lookup
  ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession *s = mds_sessions[dn->lease_mds];
      // only trust the lease while the issuing session's caps are live
      // and the generation matches
      if (s->cap_ttl > now &&
	  s->cap_gen == dn->lease_gen) {
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6251
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  // Walk `origpath` component by component (from root or cwd), doing
  // permission checks when client_permissions is set, and resolving
  // symlinks: intermediate symlinks are always followed, a trailing
  // symlink only when `followsym`.  On success *end holds the final
  // inode.
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << "  (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // need search permission on each directory we traverse
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  // absolute target: restart the walk from the root
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6333
6334
6335 // namespace ops
6336
6337 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6338 {
6339 Mutex::Locker lock(client_lock);
6340 tout(cct) << "link" << std::endl;
6341 tout(cct) << relexisting << std::endl;
6342 tout(cct) << relpath << std::endl;
6343
6344 if (unmounting)
6345 return -ENOTCONN;
6346
6347 filepath existing(relexisting);
6348
6349 InodeRef in, dir;
6350 int r = path_walk(existing, &in, perm, true);
6351 if (r < 0)
6352 return r;
6353 if (std::string(relpath) == "/") {
6354 r = -EEXIST;
6355 return r;
6356 }
6357 filepath path(relpath);
6358 string name = path.last_dentry();
6359 path.pop_dentry();
6360
6361 r = path_walk(path, &dir, perm, true);
6362 if (r < 0)
6363 return r;
6364 if (cct->_conf->client_permissions) {
6365 if (S_ISDIR(in->mode)) {
6366 r = -EPERM;
6367 return r;
6368 }
6369 r = may_hardlink(in.get(), perm);
6370 if (r < 0)
6371 return r;
6372 r = may_create(dir.get(), perm);
6373 if (r < 0)
6374 return r;
6375 }
6376 r = _link(in.get(), dir.get(), name.c_str(), perm);
6377 return r;
6378 }
6379
6380 int Client::unlink(const char *relpath, const UserPerm& perm)
6381 {
6382 Mutex::Locker lock(client_lock);
6383 tout(cct) << "unlink" << std::endl;
6384 tout(cct) << relpath << std::endl;
6385
6386 if (unmounting)
6387 return -ENOTCONN;
6388
6389 if (std::string(relpath) == "/")
6390 return -EISDIR;
6391
6392 filepath path(relpath);
6393 string name = path.last_dentry();
6394 path.pop_dentry();
6395 InodeRef dir;
6396 int r = path_walk(path, &dir, perm);
6397 if (r < 0)
6398 return r;
6399 if (cct->_conf->client_permissions) {
6400 r = may_delete(dir.get(), name.c_str(), perm);
6401 if (r < 0)
6402 return r;
6403 }
6404 return _unlink(dir.get(), name.c_str(), perm);
6405 }
6406
6407 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6408 {
6409 Mutex::Locker lock(client_lock);
6410 tout(cct) << "rename" << std::endl;
6411 tout(cct) << relfrom << std::endl;
6412 tout(cct) << relto << std::endl;
6413
6414 if (unmounting)
6415 return -ENOTCONN;
6416
6417 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6418 return -EBUSY;
6419
6420 filepath from(relfrom);
6421 filepath to(relto);
6422 string fromname = from.last_dentry();
6423 from.pop_dentry();
6424 string toname = to.last_dentry();
6425 to.pop_dentry();
6426
6427 InodeRef fromdir, todir;
6428 int r = path_walk(from, &fromdir, perm);
6429 if (r < 0)
6430 goto out;
6431 r = path_walk(to, &todir, perm);
6432 if (r < 0)
6433 goto out;
6434
6435 if (cct->_conf->client_permissions) {
6436 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6437 if (r < 0)
6438 return r;
6439 r = may_delete(todir.get(), toname.c_str(), perm);
6440 if (r < 0 && r != -ENOENT)
6441 return r;
6442 }
6443 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6444 out:
6445 return r;
6446 }
6447
6448 // dirs
6449
6450 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6451 {
6452 Mutex::Locker lock(client_lock);
6453 tout(cct) << "mkdir" << std::endl;
6454 tout(cct) << relpath << std::endl;
6455 tout(cct) << mode << std::endl;
6456 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6457
6458 if (unmounting)
6459 return -ENOTCONN;
6460
6461 if (std::string(relpath) == "/")
6462 return -EEXIST;
6463
6464 filepath path(relpath);
6465 string name = path.last_dentry();
6466 path.pop_dentry();
6467 InodeRef dir;
6468 int r = path_walk(path, &dir, perm);
6469 if (r < 0)
6470 return r;
6471 if (cct->_conf->client_permissions) {
6472 r = may_create(dir.get(), perm);
6473 if (r < 0)
6474 return r;
6475 }
6476 return _mkdir(dir.get(), name.c_str(), mode, perm);
6477 }
6478
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  // Create every missing component of 'relpath' (like `mkdir -p`), walking
  // from the cwd.  Returns -EEXIST if the whole path already exists, or the
  // first error hit while looking up or creating a component.
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  // Phase 1: descend through components that already exist.  The loop
  // leaves i at the first missing component and r at the lookup result.
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;
  // only a missing component (-ENOENT) means "create from here"; any
  // other lookup failure is propagated
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  // Phase 2: create each remaining component in turn.
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // a racing client may have created an intermediate component between
    // our lookup and mkdir; treat -EEXIST on non-final components as
    // success by looking the directory up instead
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6535
6536 int Client::rmdir(const char *relpath, const UserPerm& perms)
6537 {
6538 Mutex::Locker lock(client_lock);
6539 tout(cct) << "rmdir" << std::endl;
6540 tout(cct) << relpath << std::endl;
6541
6542 if (unmounting)
6543 return -ENOTCONN;
6544
6545 if (std::string(relpath) == "/")
6546 return -EBUSY;
6547
6548 filepath path(relpath);
6549 string name = path.last_dentry();
6550 path.pop_dentry();
6551 InodeRef dir;
6552 int r = path_walk(path, &dir, perms);
6553 if (r < 0)
6554 return r;
6555 if (cct->_conf->client_permissions) {
6556 int r = may_delete(dir.get(), name.c_str(), perms);
6557 if (r < 0)
6558 return r;
6559 }
6560 return _rmdir(dir.get(), name.c_str(), perms);
6561 }
6562
6563 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6564 {
6565 Mutex::Locker lock(client_lock);
6566 tout(cct) << "mknod" << std::endl;
6567 tout(cct) << relpath << std::endl;
6568 tout(cct) << mode << std::endl;
6569 tout(cct) << rdev << std::endl;
6570
6571 if (unmounting)
6572 return -ENOTCONN;
6573
6574 if (std::string(relpath) == "/")
6575 return -EEXIST;
6576
6577 filepath path(relpath);
6578 string name = path.last_dentry();
6579 path.pop_dentry();
6580 InodeRef dir;
6581 int r = path_walk(path, &dir, perms);
6582 if (r < 0)
6583 return r;
6584 if (cct->_conf->client_permissions) {
6585 int r = may_create(dir.get(), perms);
6586 if (r < 0)
6587 return r;
6588 }
6589 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6590 }
6591
6592 // symlinks
6593
6594 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6595 {
6596 Mutex::Locker lock(client_lock);
6597 tout(cct) << "symlink" << std::endl;
6598 tout(cct) << target << std::endl;
6599 tout(cct) << relpath << std::endl;
6600
6601 if (unmounting)
6602 return -ENOTCONN;
6603
6604 if (std::string(relpath) == "/")
6605 return -EEXIST;
6606
6607 filepath path(relpath);
6608 string name = path.last_dentry();
6609 path.pop_dentry();
6610 InodeRef dir;
6611 int r = path_walk(path, &dir, perms);
6612 if (r < 0)
6613 return r;
6614 if (cct->_conf->client_permissions) {
6615 int r = may_create(dir.get(), perms);
6616 if (r < 0)
6617 return r;
6618 }
6619 return _symlink(dir.get(), name.c_str(), target, perms);
6620 }
6621
6622 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6623 {
6624 Mutex::Locker lock(client_lock);
6625 tout(cct) << "readlink" << std::endl;
6626 tout(cct) << relpath << std::endl;
6627
6628 if (unmounting)
6629 return -ENOTCONN;
6630
6631 filepath path(relpath);
6632 InodeRef in;
6633 int r = path_walk(path, &in, perms, false);
6634 if (r < 0)
6635 return r;
6636
6637 return _readlink(in.get(), buf, size);
6638 }
6639
6640 int Client::_readlink(Inode *in, char *buf, size_t size)
6641 {
6642 if (!in->is_symlink())
6643 return -EINVAL;
6644
6645 // copy into buf (at most size bytes)
6646 int r = in->symlink.length();
6647 if (r > (int)size)
6648 r = size;
6649 memcpy(buf, in->symlink.c_str(), r);
6650 return r;
6651 }
6652
6653
6654 // inode stuff
6655
6656 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6657 {
6658 bool yes = in->caps_issued_mask(mask, true);
6659
6660 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6661 if (yes && !force)
6662 return 0;
6663
6664 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6665 filepath path;
6666 in->make_nosnap_relative_path(path);
6667 req->set_filepath(path);
6668 req->set_inode(in);
6669 req->head.args.getattr.mask = mask;
6670
6671 int res = make_request(req, perms);
6672 ldout(cct, 10) << "_getattr result=" << res << dendl;
6673 return res;
6674 }
6675
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  // Apply the attribute changes in stx selected by mask.  Where the client
  // holds exclusive caps, the change is made locally and the corresponding
  // caps are marked dirty; any bits that cannot be satisfied locally are
  // sent to the MDS in a SETATTR request.  The order of the cap checks and
  // mask clearing below is significant — do not reorder.
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must not push the containing quota over its limit
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold; if we hold none, fall back
    // to asking the MDS to bump the ctime for us
    if (issued & CEPH_CAP_AUTH_EXCL)
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // with AUTH_EXCL we can perform ownership/mode changes locally;
    // kill_sguid tracks whether setuid/setgid bits must be cleared
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only the permission bits change; the file-type bits are kept
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    // with FILE_EXCL, atime/mtime can be updated locally
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
	in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
	in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  // if every requested bit was satisfied locally, we are done
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  // send the remaining attribute changes to the MDS
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    // refuse sizes beyond the cluster-wide maximum file size
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6866
6867 /* Note that we only care about attrs that setattr cares about */
6868 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6869 {
6870 stx->stx_size = st->st_size;
6871 stx->stx_mode = st->st_mode;
6872 stx->stx_uid = st->st_uid;
6873 stx->stx_gid = st->st_gid;
6874 stx->stx_mtime = st->st_mtim;
6875 stx->stx_atime = st->st_atim;
6876 }
6877
6878 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6879 const UserPerm& perms, InodeRef *inp)
6880 {
6881 int ret = _do_setattr(in, stx, mask, perms, inp);
6882 if (ret < 0)
6883 return ret;
6884 if (mask & CEPH_SETATTR_MODE)
6885 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6886 return ret;
6887 }
6888
6889 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6890 const UserPerm& perms)
6891 {
6892 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6893 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6894 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6895 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6896 if (cct->_conf->client_permissions) {
6897 int r = may_setattr(in.get(), stx, mask, perms);
6898 if (r < 0)
6899 return r;
6900 }
6901 return __setattrx(in.get(), stx, mask, perms);
6902 }
6903
6904 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6905 const UserPerm& perms)
6906 {
6907 struct ceph_statx stx;
6908
6909 stat_to_statx(attr, &stx);
6910 mask &= ~CEPH_SETATTR_BTIME;
6911
6912 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6913 mask &= ~CEPH_SETATTR_UID;
6914 }
6915 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6916 mask &= ~CEPH_SETATTR_GID;
6917 }
6918
6919 return _setattrx(in, &stx, mask, perms);
6920 }
6921
6922 int Client::setattr(const char *relpath, struct stat *attr, int mask,
6923 const UserPerm& perms)
6924 {
6925 Mutex::Locker lock(client_lock);
6926 tout(cct) << "setattr" << std::endl;
6927 tout(cct) << relpath << std::endl;
6928 tout(cct) << mask << std::endl;
6929
6930 if (unmounting)
6931 return -ENOTCONN;
6932
6933 filepath path(relpath);
6934 InodeRef in;
6935 int r = path_walk(path, &in, perms);
6936 if (r < 0)
6937 return r;
6938 return _setattr(in, attr, mask, perms);
6939 }
6940
6941 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6942 const UserPerm& perms, int flags)
6943 {
6944 Mutex::Locker lock(client_lock);
6945 tout(cct) << "setattrx" << std::endl;
6946 tout(cct) << relpath << std::endl;
6947 tout(cct) << mask << std::endl;
6948
6949 if (unmounting)
6950 return -ENOTCONN;
6951
6952 filepath path(relpath);
6953 InodeRef in;
6954 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6955 if (r < 0)
6956 return r;
6957 return _setattrx(in, stx, mask, perms);
6958 }
6959
6960 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6961 {
6962 Mutex::Locker lock(client_lock);
6963 tout(cct) << "fsetattr" << std::endl;
6964 tout(cct) << fd << std::endl;
6965 tout(cct) << mask << std::endl;
6966
6967 if (unmounting)
6968 return -ENOTCONN;
6969
6970 Fh *f = get_filehandle(fd);
6971 if (!f)
6972 return -EBADF;
6973 #if defined(__linux__) && defined(O_PATH)
6974 if (f->flags & O_PATH)
6975 return -EBADF;
6976 #endif
6977 return _setattr(f->inode, attr, mask, perms);
6978 }
6979
6980 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6981 {
6982 Mutex::Locker lock(client_lock);
6983 tout(cct) << "fsetattr" << std::endl;
6984 tout(cct) << fd << std::endl;
6985 tout(cct) << mask << std::endl;
6986
6987 if (unmounting)
6988 return -ENOTCONN;
6989
6990 Fh *f = get_filehandle(fd);
6991 if (!f)
6992 return -EBADF;
6993 #if defined(__linux__) && defined(O_PATH)
6994 if (f->flags & O_PATH)
6995 return -EBADF;
6996 #endif
6997 return _setattrx(f->inode, stx, mask, perms);
6998 }
6999
7000 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7001 frag_info_t *dirstat, int mask)
7002 {
7003 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7004 Mutex::Locker lock(client_lock);
7005 tout(cct) << "stat" << std::endl;
7006 tout(cct) << relpath << std::endl;
7007
7008 if (unmounting)
7009 return -ENOTCONN;
7010
7011 filepath path(relpath);
7012 InodeRef in;
7013 int r = path_walk(path, &in, perms, true, mask);
7014 if (r < 0)
7015 return r;
7016 r = _getattr(in, mask, perms);
7017 if (r < 0) {
7018 ldout(cct, 3) << "stat exit on error!" << dendl;
7019 return r;
7020 }
7021 fill_stat(in, stbuf, dirstat);
7022 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7023 return r;
7024 }
7025
7026 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7027 {
7028 unsigned mask = 0;
7029
7030 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7031 if (flags & AT_NO_ATTR_SYNC)
7032 goto out;
7033
7034 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7035 mask |= CEPH_CAP_PIN;
7036 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7037 mask |= CEPH_CAP_AUTH_SHARED;
7038 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7039 mask |= CEPH_CAP_LINK_SHARED;
7040 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7041 mask |= CEPH_CAP_FILE_SHARED;
7042 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7043 mask |= CEPH_CAP_XATTR_SHARED;
7044 out:
7045 return mask;
7046 }
7047
7048 int Client::statx(const char *relpath, struct ceph_statx *stx,
7049 const UserPerm& perms,
7050 unsigned int want, unsigned int flags)
7051 {
7052 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
7053 Mutex::Locker lock(client_lock);
7054 tout(cct) << "statx" << std::endl;
7055 tout(cct) << relpath << std::endl;
7056
7057 if (unmounting)
7058 return -ENOTCONN;
7059
7060 filepath path(relpath);
7061 InodeRef in;
7062
7063 unsigned mask = statx_to_mask(flags, want);
7064
7065 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7066 if (r < 0)
7067 return r;
7068
7069 r = _getattr(in, mask, perms);
7070 if (r < 0) {
7071 ldout(cct, 3) << "statx exit on error!" << dendl;
7072 return r;
7073 }
7074
7075 fill_statx(in, mask, stx);
7076 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7077 return r;
7078 }
7079
7080 int Client::lstat(const char *relpath, struct stat *stbuf,
7081 const UserPerm& perms, frag_info_t *dirstat, int mask)
7082 {
7083 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7084 Mutex::Locker lock(client_lock);
7085 tout(cct) << "lstat" << std::endl;
7086 tout(cct) << relpath << std::endl;
7087
7088 if (unmounting)
7089 return -ENOTCONN;
7090
7091 filepath path(relpath);
7092 InodeRef in;
7093 // don't follow symlinks
7094 int r = path_walk(path, &in, perms, false, mask);
7095 if (r < 0)
7096 return r;
7097 r = _getattr(in, mask, perms);
7098 if (r < 0) {
7099 ldout(cct, 3) << "lstat exit on error!" << dendl;
7100 return r;
7101 }
7102 fill_stat(in, stbuf, dirstat);
7103 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7104 return r;
7105 }
7106
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  // Populate a struct stat from the cached inode state.  Optionally copies
  // out the directory fragment stats (dirstat) and recursive stats (rstat).
  // Returns the caps currently issued on the inode.
  ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // the snapshot id doubles as the device number
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  st->st_nlink = in->nlink;
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report whichever of ctime/mtime is newer as the ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // directory "size" is either the recursive byte count or the entry
    // count, depending on configuration
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    // 512-byte blocks, rounded up
    st->st_blocks = (in->size + 511) >> 9;
  }
  st->st_blksize = MAX(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7153
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  // Populate a ceph_statx from the cached inode state.  'mask' is the cap
  // mask describing which attribute groups are valid; each group that is
  // present sets the corresponding CEPH_STATX_* bits in stx->stx_mask.
  ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  // ownership, full mode and birth time are covered by AUTH caps
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  // link count is covered by LINK caps
  if (mask & CEPH_CAP_LINK_SHARED) {
    stx->stx_nlink = in->nlink;
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  // times, size and block counts are covered by FILE caps
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // directory "size" is either the recursive byte count or the
      // entry count, depending on configuration
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7221
void Client::touch_dn(Dentry *dn)
{
  // Mark the dentry as recently used so the LRU trimmer keeps it longer.
  lru.lru_touch(dn);
}
7226
7227 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7228 {
7229 Mutex::Locker lock(client_lock);
7230 tout(cct) << "chmod" << std::endl;
7231 tout(cct) << relpath << std::endl;
7232 tout(cct) << mode << std::endl;
7233
7234 if (unmounting)
7235 return -ENOTCONN;
7236
7237 filepath path(relpath);
7238 InodeRef in;
7239 int r = path_walk(path, &in, perms);
7240 if (r < 0)
7241 return r;
7242 struct stat attr;
7243 attr.st_mode = mode;
7244 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7245 }
7246
7247 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7248 {
7249 Mutex::Locker lock(client_lock);
7250 tout(cct) << "fchmod" << std::endl;
7251 tout(cct) << fd << std::endl;
7252 tout(cct) << mode << std::endl;
7253
7254 if (unmounting)
7255 return -ENOTCONN;
7256
7257 Fh *f = get_filehandle(fd);
7258 if (!f)
7259 return -EBADF;
7260 #if defined(__linux__) && defined(O_PATH)
7261 if (f->flags & O_PATH)
7262 return -EBADF;
7263 #endif
7264 struct stat attr;
7265 attr.st_mode = mode;
7266 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7267 }
7268
7269 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7270 {
7271 Mutex::Locker lock(client_lock);
7272 tout(cct) << "lchmod" << std::endl;
7273 tout(cct) << relpath << std::endl;
7274 tout(cct) << mode << std::endl;
7275
7276 if (unmounting)
7277 return -ENOTCONN;
7278
7279 filepath path(relpath);
7280 InodeRef in;
7281 // don't follow symlinks
7282 int r = path_walk(path, &in, perms, false);
7283 if (r < 0)
7284 return r;
7285 struct stat attr;
7286 attr.st_mode = mode;
7287 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7288 }
7289
7290 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7291 const UserPerm& perms)
7292 {
7293 Mutex::Locker lock(client_lock);
7294 tout(cct) << "chown" << std::endl;
7295 tout(cct) << relpath << std::endl;
7296 tout(cct) << new_uid << std::endl;
7297 tout(cct) << new_gid << std::endl;
7298
7299 if (unmounting)
7300 return -ENOTCONN;
7301
7302 filepath path(relpath);
7303 InodeRef in;
7304 int r = path_walk(path, &in, perms);
7305 if (r < 0)
7306 return r;
7307 struct stat attr;
7308 attr.st_uid = new_uid;
7309 attr.st_gid = new_gid;
7310 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7311 }
7312
7313 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7314 {
7315 Mutex::Locker lock(client_lock);
7316 tout(cct) << "fchown" << std::endl;
7317 tout(cct) << fd << std::endl;
7318 tout(cct) << new_uid << std::endl;
7319 tout(cct) << new_gid << std::endl;
7320
7321 if (unmounting)
7322 return -ENOTCONN;
7323
7324 Fh *f = get_filehandle(fd);
7325 if (!f)
7326 return -EBADF;
7327 #if defined(__linux__) && defined(O_PATH)
7328 if (f->flags & O_PATH)
7329 return -EBADF;
7330 #endif
7331 struct stat attr;
7332 attr.st_uid = new_uid;
7333 attr.st_gid = new_gid;
7334 int mask = 0;
7335 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7336 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7337 return _setattr(f->inode, &attr, mask, perms);
7338 }
7339
7340 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7341 const UserPerm& perms)
7342 {
7343 Mutex::Locker lock(client_lock);
7344 tout(cct) << "lchown" << std::endl;
7345 tout(cct) << relpath << std::endl;
7346 tout(cct) << new_uid << std::endl;
7347 tout(cct) << new_gid << std::endl;
7348
7349 if (unmounting)
7350 return -ENOTCONN;
7351
7352 filepath path(relpath);
7353 InodeRef in;
7354 // don't follow symlinks
7355 int r = path_walk(path, &in, perms, false);
7356 if (r < 0)
7357 return r;
7358 struct stat attr;
7359 attr.st_uid = new_uid;
7360 attr.st_gid = new_gid;
7361 int mask = 0;
7362 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7363 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7364 return _setattr(in, &attr, mask, perms);
7365 }
7366
7367 int Client::utime(const char *relpath, struct utimbuf *buf,
7368 const UserPerm& perms)
7369 {
7370 Mutex::Locker lock(client_lock);
7371 tout(cct) << "utime" << std::endl;
7372 tout(cct) << relpath << std::endl;
7373 tout(cct) << buf->modtime << std::endl;
7374 tout(cct) << buf->actime << std::endl;
7375
7376 if (unmounting)
7377 return -ENOTCONN;
7378
7379 filepath path(relpath);
7380 InodeRef in;
7381 int r = path_walk(path, &in, perms);
7382 if (r < 0)
7383 return r;
7384 struct stat attr;
7385 stat_set_mtime_sec(&attr, buf->modtime);
7386 stat_set_mtime_nsec(&attr, 0);
7387 stat_set_atime_sec(&attr, buf->actime);
7388 stat_set_atime_nsec(&attr, 0);
7389 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7390 }
7391
7392 int Client::lutime(const char *relpath, struct utimbuf *buf,
7393 const UserPerm& perms)
7394 {
7395 Mutex::Locker lock(client_lock);
7396 tout(cct) << "lutime" << std::endl;
7397 tout(cct) << relpath << std::endl;
7398 tout(cct) << buf->modtime << std::endl;
7399 tout(cct) << buf->actime << std::endl;
7400
7401 if (unmounting)
7402 return -ENOTCONN;
7403
7404 filepath path(relpath);
7405 InodeRef in;
7406 // don't follow symlinks
7407 int r = path_walk(path, &in, perms, false);
7408 if (r < 0)
7409 return r;
7410 struct stat attr;
7411 stat_set_mtime_sec(&attr, buf->modtime);
7412 stat_set_mtime_nsec(&attr, 0);
7413 stat_set_atime_sec(&attr, buf->actime);
7414 stat_set_atime_nsec(&attr, 0);
7415 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7416 }
7417
7418 int Client::flock(int fd, int operation, uint64_t owner)
7419 {
7420 Mutex::Locker lock(client_lock);
7421 tout(cct) << "flock" << std::endl;
7422 tout(cct) << fd << std::endl;
7423 tout(cct) << operation << std::endl;
7424 tout(cct) << owner << std::endl;
7425
7426 if (unmounting)
7427 return -ENOTCONN;
7428
7429 Fh *f = get_filehandle(fd);
7430 if (!f)
7431 return -EBADF;
7432
7433 return _flock(f, operation, owner);
7434 }
7435
7436 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7437 {
7438 Mutex::Locker lock(client_lock);
7439 tout(cct) << "opendir" << std::endl;
7440 tout(cct) << relpath << std::endl;
7441
7442 if (unmounting)
7443 return -ENOTCONN;
7444
7445 filepath path(relpath);
7446 InodeRef in;
7447 int r = path_walk(path, &in, perms, true);
7448 if (r < 0)
7449 return r;
7450 if (cct->_conf->client_permissions) {
7451 int r = may_open(in.get(), O_RDONLY, perms);
7452 if (r < 0)
7453 return r;
7454 }
7455 r = _opendir(in.get(), dirpp, perms);
7456 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7457 if (r != -ENOTDIR)
7458 tout(cct) << (unsigned long)*dirpp << std::endl;
7459 return r;
7460 }
7461
7462 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7463 {
7464 if (!in->is_dir())
7465 return -ENOTDIR;
7466 *dirpp = new dir_result_t(in, perms);
7467 opened_dirs.insert(*dirpp);
7468 ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7469 return 0;
7470 }
7471
7472
// Close a directory handle returned by opendir().  Always returns 0; the
// handle is freed and must not be used afterwards.
// NOTE(review): unlike most entry points there is no 'unmounting' guard
// here -- presumably deliberate so handles can still be released (and not
// leaked) during shutdown; confirm before adding one.
int Client::closedir(dir_result_t *dir)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "closedir" << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7483
// Tear down a dir_result_t: drop its inode reference and any buffered
// readdir fragment, deregister it from opened_dirs, and free it.
// Caller must hold client_lock.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
    // release our InodeRef on the directory
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7495
7496 void Client::rewinddir(dir_result_t *dirp)
7497 {
7498 Mutex::Locker lock(client_lock);
7499 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
7500
7501 if (unmounting)
7502 return;
7503
7504 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7505 _readdir_drop_dirp_buffer(d);
7506 d->reset();
7507 }
7508
// Report the current readdir position for this handle (telldir(3)).  The
// value is an opaque offset suitable for a later seekdir().
// NOTE(review): unlike seekdir()/rewinddir() this does not take
// client_lock -- confirm callers serialize access to the handle.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7515
// Reposition the readdir offset (seekdir(3)).  Offsets are the opaque
// values produced by telldir()/readdir: the high bits select a dirfrag (or
// a hash position in hash-order mode) and the low bits index within it.
// The buffered fragment is dropped whenever the new position cannot be
// served from it.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  // any seek invalidates part of the cache-fill bookkeeping
  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // in hash-order mode only a backward seek forces a refetch
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // drop the buffer when rewinding to the start, landing in a different
    // frag than the buffered one, or seeking backwards within the frag
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7549
7550
7551 //struct dirent {
7552 // ino_t d_ino; /* inode number */
7553 // off_t d_off; /* offset to the next dirent */
7554 // unsigned short d_reclen; /* length of this record */
7555 // unsigned char d_type; /* type of file */
7556 // char d_name[256]; /* filename */
7557 //};
// Populate a struct dirent from a (name, type, ino) triple.  next_off is
// the opaque offset of the *following* entry (the readdir(3) d_off
// convention).  type is S_IF* mode bits, converted to a DT_* value.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // d_name is a fixed 256-byte array: truncate long names, always terminate
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  de->d_off = next_off;  // d_off is absent from the Darwin/FreeBSD dirent
#endif
  // NOTE(review): d_reclen is a dummy 1, not the true record length;
  // consumers here apparently do not rely on it -- confirm before changing.
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7573
// Advance dirp past the buffered fragment: either mark end-of-directory
// (if it was the rightmost frag) or step to the next frag and reset the
// within-frag position to the first real entry (offset 2; 0/1 are "."/"..").
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    // offsets encode a hash position in this mode; only move forward
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    // the fragtree may have changed underneath us; remap to the live frag
    _readdir_rechoose_frag(dirp);
  }
}
7599
// Re-resolve the current fragment against the inode's dirfragtree; if the
// tree changed (frag split or merged), restart at the beginning of the
// frag that now covers our position.  No-op in hash-order mode, whose
// offsets are frag-independent.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;  // offsets 0/1 are reserved for "." and ".."
  }
}
7616
// Discard the buffered readdir fragment (the dentries most recently
// fetched from the MDS for this handle).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
  dirp->buffer.clear();
}
7622
7623 int Client::_readdir_get_frag(dir_result_t *dirp)
7624 {
7625 assert(dirp);
7626 assert(dirp->inode);
7627
7628 // get the current frag.
7629 frag_t fg;
7630 if (dirp->hash_order())
7631 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7632 else
7633 fg = frag_t(dirp->offset_high());
7634
7635 ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
7636 << " offset " << hex << dirp->offset << dec << dendl;
7637
7638 int op = CEPH_MDS_OP_READDIR;
7639 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7640 op = CEPH_MDS_OP_LSSNAP;
7641
7642 InodeRef& diri = dirp->inode;
7643
7644 MetaRequest *req = new MetaRequest(op);
7645 filepath path;
7646 diri->make_nosnap_relative_path(path);
7647 req->set_filepath(path);
7648 req->set_inode(diri.get());
7649 req->head.args.readdir.frag = fg;
7650 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7651 if (dirp->last_name.length()) {
7652 req->path2.set_path(dirp->last_name);
7653 } else if (dirp->hash_order()) {
7654 req->head.args.readdir.offset_hash = dirp->offset_high();
7655 }
7656 req->dirp = dirp;
7657
7658 bufferlist dirbl;
7659 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7660
7661 if (res == -EAGAIN) {
7662 ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
7663 _readdir_rechoose_frag(dirp);
7664 return _readdir_get_frag(dirp);
7665 }
7666
7667 if (res == 0) {
7668 ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
7669 << " size " << dirp->buffer.size() << dendl;
7670 } else {
7671 ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
7672 dirp->set_end();
7673 }
7674
7675 return res;
7676 }
7677
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// dentries by readdir offset using the frag-aware fpos comparison.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7683
// Serve readdir entries out of the locally cached directory contents
// (Dir::readdir_cache) instead of round-tripping to the MDS.  Invokes cb
// once per entry with client_lock DROPPED.  Returns 0 at end of directory,
// -EAGAIN if the directory stops being complete+ordered mid-walk (caller
// falls back to fragment fetches), a negative error from _getattr()/cb, or
// a positive value if cb asked to stop after an entry.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // binary-search the cache for the first dentry at/after our offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // re-check every iteration: the lock is dropped around cb below, and
    // the cache is only trustworthy while the dir stays complete+ordered
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      // caller asked for a referenced Inode* alongside each entry
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // NOTE(review): pd points into dir->readdir_cache across this unlock;
    // the completeness re-check at the loop top is the guard -- confirm
    // the vector cannot be mutated while the lock is dropped.
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    // record progress so a retry resumes after this entry
    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7766
// Core readdir driver: walk the directory handle d, invoking
// cb(p, dirent, statx, next_off, inode) once per entry, with client_lock
// dropped around each callback.  Synthesizes "." (offset 0) and ".."
// (offset 1) first, then serves entries from the local cache when the dir
// is complete+ordered with FILE_SHARED caps, otherwise fetches fragments
// from the MDS.  want/flags select the statx fields to fill; when getref
// is true each entry carries a referenced Inode* the callback owns.
// cb returning <0 aborts the walk with that error; >0 stops the walk
// after recording progress (and is returned).  Returns 0 at end of
// directory, or a negative errno.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
	   << dec << " at_end=" << dirp->at_end()
	   << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "." from the directory inode itself
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".." from the first parent (or self at the root)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache became unusable mid-walk; fall through to MDS fetch
  }

  // fragment-by-fragment fetch from the MDS
  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // deliver the buffered entries at/after our current offset
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // rightmost frag done: if nothing changed since we started, we have
    // seen the whole directory and can mark the inode's cache complete
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
7959
7960
7961 int Client::readdir_r(dir_result_t *d, struct dirent *de)
7962 {
7963 return readdirplus_r(d, de, 0, 0, 0, NULL);
7964 }
7965
7966 /*
7967 * readdirplus_r
7968 *
7969 * returns
7970 * 1 if we got a dirent
7971 * 0 for end of directory
7972 * <0 on error
7973 */
7974
/* Capture slot for the single-entry readdir callback: de/stx point at
 * caller-provided output buffers (stx may be NULL), inode receives a
 * referenced Inode* when requested, and full flips to true once one
 * entry has been stored. */
struct single_readdir {
  struct dirent *de;
  struct ceph_statx *stx;
  Inode *inode;
  bool full;
};
7981
7982 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
7983 struct ceph_statx *stx, off_t off,
7984 Inode *in)
7985 {
7986 single_readdir *c = static_cast<single_readdir *>(p);
7987
7988 if (c->full)
7989 return -1; // already filled this dirent
7990
7991 *c->de = *de;
7992 if (c->stx)
7993 *c->stx = *stx;
7994 c->inode = in;
7995 c->full = true;
7996 return 1;
7997 }
7998
// readdir(3)-style interface: return a pointer to the next entry, or NULL
// at end of directory / on error (errno set on error).
// NOTE(review): the returned dirent is a function-local static, so the
// result is overwritten by the next call and is not thread-safe -- this
// mirrors the classic readdir(3) contract.
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    // real error (not the callback's -1 "stop" signal): report via errno
    errno = -ret;  // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;
}
8021
8022 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8023 struct ceph_statx *stx, unsigned want,
8024 unsigned flags, Inode **out)
8025 {
8026 single_readdir sr;
8027 sr.de = de;
8028 sr.stx = stx;
8029 sr.inode = NULL;
8030 sr.full = false;
8031
8032 // our callback fills the dirent and sets sr.full=true on first
8033 // call, and returns -1 the second time around.
8034 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8035 if (r < -1)
8036 return r;
8037 if (out)
8038 *out = sr.inode;
8039 if (sr.full)
8040 return 1;
8041 return 0;
8042 }
8043
8044
8045 /* getdents */
/* Accumulator for _getdents(): buf/buflen describe the caller's output
 * buffer, pos is the number of bytes written so far, and fullent selects
 * whole struct dirent records versus NUL-terminated names only. */
struct getdents_result {
  char *buf;
  int buflen;
  int pos;
  bool fullent;
};
8052
8053 static int _readdir_getdent_cb(void *p, struct dirent *de,
8054 struct ceph_statx *stx, off_t off, Inode *in)
8055 {
8056 struct getdents_result *c = static_cast<getdents_result *>(p);
8057
8058 int dlen;
8059 if (c->fullent)
8060 dlen = sizeof(*de);
8061 else
8062 dlen = strlen(de->d_name) + 1;
8063
8064 if (c->pos + dlen > c->buflen)
8065 return -1; // doesn't fit
8066
8067 if (c->fullent) {
8068 memcpy(c->buf + c->pos, de, sizeof(*de));
8069 } else {
8070 memcpy(c->buf + c->pos, de->d_name, dlen);
8071 }
8072 c->pos += dlen;
8073 return 0;
8074 }
8075
8076 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8077 {
8078 getdents_result gr;
8079 gr.buf = buf;
8080 gr.buflen = buflen;
8081 gr.fullent = fullent;
8082 gr.pos = 0;
8083
8084 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8085
8086 if (r < 0) { // some error
8087 if (r == -1) { // buffer ran out of space
8088 if (gr.pos) { // but we got some entries already!
8089 return gr.pos;
8090 } // or we need a larger buffer
8091 return -ERANGE;
8092 } else { // actual error, return it
8093 return r;
8094 }
8095 }
8096 return gr.pos;
8097 }
8098
8099
8100 /* getdir */
/* Accumulator for getdir(): contents collects the entry names, num counts
 * how many were seen. */
struct getdir_result {
  list<string> *contents;
  int num;
};
8105
8106 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8107 {
8108 getdir_result *r = static_cast<getdir_result *>(p);
8109
8110 r->contents->push_back(de->d_name);
8111 r->num++;
8112 return 0;
8113 }
8114
8115 int Client::getdir(const char *relpath, list<string>& contents,
8116 const UserPerm& perms)
8117 {
8118 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8119 {
8120 Mutex::Locker lock(client_lock);
8121 tout(cct) << "getdir" << std::endl;
8122 tout(cct) << relpath << std::endl;
8123 }
8124
8125 dir_result_t *d;
8126 int r = opendir(relpath, &d, perms);
8127 if (r < 0)
8128 return r;
8129
8130 getdir_result gr;
8131 gr.contents = &contents;
8132 gr.num = 0;
8133 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8134
8135 closedir(d);
8136
8137 if (r < 0)
8138 return r;
8139 return gr.num;
8140 }
8141
8142
8143 /****** file i/o **********/
// Open (and with O_CREAT, possibly create) the file at relpath.  Returns
// a nonnegative integer file descriptor on success or a negative errno.
// The layout parameters (stripe_unit/count, object_size, data_pool) are
// only consulted when a new file is created; 0/NULL selects defaults.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // exclusive create of an existing file must fail
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  // file absent and O_CREAT: create it in the parent directory
  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may already have produced an Fh; otherwise open the inode now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8228
8229 int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8230 {
8231 /* Use default file striping parameters */
8232 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8233 }
8234
8235 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8236 const UserPerm& perms)
8237 {
8238 Mutex::Locker lock(client_lock);
8239 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8240
8241 if (unmounting)
8242 return -ENOTCONN;
8243
8244 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8245 filepath path(ino);
8246 req->set_filepath(path);
8247
8248 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8249 char f[30];
8250 sprintf(f, "%u", h);
8251 filepath path2(dirino);
8252 path2.push_dentry(string(f));
8253 req->set_filepath2(path2);
8254
8255 int r = make_request(req, perms, NULL, NULL,
8256 rand() % mdsmap->get_num_in_mds());
8257 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8258 return r;
8259 }
8260
8261
8262 /**
8263 * Load inode into local cache.
8264 *
8265 * If inode pointer is non-NULL, and take a reference on
8266 * the resulting Inode object in one operation, so that caller
8267 * can safely assume inode will still be there after return.
8268 */
// Load inode `ino` into the local cache via CEPH_MDS_OP_LOOKUPINO.  If
// `inode` is non-NULL, on success it receives the cached Inode* with a
// reference taken (_ll_get), so the caller may safely use it after return.
// Returns 0 or a negative errno.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // any MDS can serve this; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // a successful reply must have populated inode_map with the head inode
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
  return r;
}
8292
8293
8294
8295 /**
8296 * Find the parent inode of `ino` and insert it into
8297 * our cache. Conditionally also set `parent` to a referenced
8298 * Inode* if caller provides non-NULL value.
8299 */
// Find the parent inode of `ino` and pull it into the cache
// (CEPH_MDS_OP_LOOKUPPARENT).  If `parent` is non-NULL, a successful MDS
// lookup stores a referenced Inode* there; on failure it is set to NULL.
// Returns 0, -EINVAL for the root, or a negative errno.
// NOTE(review): when a dentry is already present we return 0 WITHOUT
// writing to *parent, so a caller passing non-NULL parent must not read
// it on that path -- confirm all callers handle this.
int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  if (!ino->dn_set.empty()) {
    // if we exposed the parent here, we'd need to check permissions,
    // but right now we just rely on the MDS doing so in make_request
    ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
    return 0;
  }

  if (ino->is_root()) {
    *parent = NULL;
    ldout(cct, 3) << "ino is root, no parent" << dendl;
    return -EINVAL;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8340
8341
8342 /**
8343 * Populate the parent dentry for `ino`, provided it is
8344 * a child of `parent`.
8345 */
// Populate the parent dentry for `ino`, provided it is a child of
// directory `parent` (CEPH_MDS_OP_LOOKUPNAME: the MDS replies with the
// name linking the two).  Returns 0 or a negative errno.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  assert(parent->is_dir());

  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // any MDS can serve this; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8365
8366
8367 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8368 {
8369 assert(in);
8370 Fh *f = new Fh(in);
8371 f->mode = cmode;
8372 f->flags = flags;
8373
8374 // inode
8375 f->actor_perms = perms;
8376
8377 ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;
8378
8379 if (in->snapid != CEPH_NOSNAP) {
8380 in->snap_cap_refs++;
8381 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8382 << ccap_string(in->caps_issued()) << dendl;
8383 }
8384
8385 const md_config_t *conf = cct->_conf;
8386 f->readahead.set_trigger_requests(1);
8387 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8388 uint64_t max_readahead = Readahead::NO_LIMIT;
8389 if (conf->client_readahead_max_bytes) {
8390 max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8391 }
8392 if (conf->client_readahead_max_periods) {
8393 max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8394 }
8395 f->readahead.set_max_readahead_size(max_readahead);
8396 vector<uint64_t> alignments;
8397 alignments.push_back(in->layout.get_period());
8398 alignments.push_back(in->layout.stripe_unit);
8399 f->readahead.set_alignments(alignments);
8400
8401 return f;
8402 }
8403
// Release a file handle: drop any delegation, put the open ref (for head
// inodes this flushes dirty data and re-evaluates caps; for snapshots it
// drops the snap cap ref pinned in _create_fh), release file locks, and
// surface any asynchronous write error accumulated on the handle.
// Returns that async error (0 if none).  The Fh may be freed on return.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    // last opener in this mode: flush dirty data and let caps be released
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshots only ever hold the ref taken in _create_fh
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8438
8439 void Client::_put_fh(Fh *f)
8440 {
8441 int left = f->put();
8442 if (!left) {
8443 delete f;
8444 }
8445 }
8446
// Open inode `in` with the given flags/mode, producing a new Fh in *fhp
// (if fhp is non-NULL) on success.  If we already hold the caps the open
// mode wants (and no O_TRUNC), the open is satisfied locally; otherwise a
// CEPH_MDS_OP_OPEN round trip is made.  Snapshots are read-only: any
// write-ish flag yields -EROFS.  Returns 0 or a negative errno.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs: no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    // O_CREAT is the caller's business (see Client::open); strip it here
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 1) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode);
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // failed open: undo the pending-open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8525
/**
 * Re-establish caps for an inode.
 *
 * If we still hold some caps and either want no write caps or still know
 * the auth MDS, a plain cap check suffices.  Otherwise re-issue an OPEN
 * request, with open flags derived from the caps currently wanted, to
 * re-establish state with the MDS.
 *
 * @return 0 if a cap check was enough, otherwise the result of the OPEN.
 */
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate the wanted file caps back into open(2)-style flags
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8563
8564 int Client::close(int fd)
8565 {
8566 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8567 Mutex::Locker lock(client_lock);
8568 tout(cct) << "close" << std::endl;
8569 tout(cct) << fd << std::endl;
8570
8571 if (unmounting)
8572 return -ENOTCONN;
8573
8574 Fh *fh = get_filehandle(fd);
8575 if (!fh)
8576 return -EBADF;
8577 int err = _release_fh(fh);
8578 fd_map.erase(fd);
8579 put_fd(fd);
8580 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8581 return err;
8582 }
8583
8584
8585 // ------------
8586 // read, write
8587
8588 loff_t Client::lseek(int fd, loff_t offset, int whence)
8589 {
8590 Mutex::Locker lock(client_lock);
8591 tout(cct) << "lseek" << std::endl;
8592 tout(cct) << fd << std::endl;
8593 tout(cct) << offset << std::endl;
8594 tout(cct) << whence << std::endl;
8595
8596 if (unmounting)
8597 return -ENOTCONN;
8598
8599 Fh *f = get_filehandle(fd);
8600 if (!f)
8601 return -EBADF;
8602 #if defined(__linux__) && defined(O_PATH)
8603 if (f->flags & O_PATH)
8604 return -EBADF;
8605 #endif
8606 return _lseek(f, offset, whence);
8607 }
8608
8609 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8610 {
8611 Inode *in = f->inode.get();
8612 int r;
8613
8614 switch (whence) {
8615 case SEEK_SET:
8616 f->pos = offset;
8617 break;
8618
8619 case SEEK_CUR:
8620 f->pos += offset;
8621 break;
8622
8623 case SEEK_END:
8624 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8625 if (r < 0)
8626 return r;
8627 f->pos = in->size + offset;
8628 break;
8629
8630 default:
8631 ceph_abort();
8632 }
8633
8634 ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8635 return f->pos;
8636 }
8637
8638
/**
 * Take the per-Fh file-position "lock" (serializes pos-relative reads and
 * writes on the same handle).  Called with client_lock held.
 *
 * Each contended caller queues its own Cond on f->pos_waiters and blocks
 * until the lock is free AND its Cond is at the front of the queue, which
 * gives strict FIFO ordering among waiters.
 */
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until the lock is free and we are the oldest waiter
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8656
8657 void Client::unlock_fh_pos(Fh *f)
8658 {
8659 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8660 f->pos_locked = false;
8661 }
8662
/**
 * Migrate a file's MDS-held inline data out to its first RADOS object.
 *
 * Issues two object operations on <ino>.00000000: one to create the
 * object, then a guarded write (cmpxattr on "inline_version") that copies
 * the inline data to offset 0 and records the version in an xattr.  The
 * guard makes the migration idempotent across racing clients.
 *
 * @param onfinish completed with the result of the second mutation
 *                 (or immediately with 0 if there is no inline data)
 * @return always 0; the real outcome is delivered via onfinish
 */
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // object name of the file's first stripe unit: "<ino-hex>.00000000"
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // only proceed if our inline_version is newer than what the object has
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8707
8708 //
8709
8710 // blocking osd interface
8711
8712 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8713 {
8714 Mutex::Locker lock(client_lock);
8715 tout(cct) << "read" << std::endl;
8716 tout(cct) << fd << std::endl;
8717 tout(cct) << size << std::endl;
8718 tout(cct) << offset << std::endl;
8719
8720 if (unmounting)
8721 return -ENOTCONN;
8722
8723 Fh *f = get_filehandle(fd);
8724 if (!f)
8725 return -EBADF;
8726 #if defined(__linux__) && defined(O_PATH)
8727 if (f->flags & O_PATH)
8728 return -EBADF;
8729 #endif
8730 bufferlist bl;
8731 int r = _read(f, offset, size, &bl);
8732 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8733 if (r >= 0) {
8734 bl.copy(0, bl.length(), buf);
8735 r = bl.length();
8736 }
8737 return r;
8738 }
8739
8740 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8741 {
8742 if (iovcnt < 0)
8743 return -EINVAL;
8744 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8745 }
8746
/**
 * Core read path.
 *
 * Reads `size` bytes at `offset` (or at the current fd position, updating
 * it, when offset < 0) into *bl.  Handles inline-data files, cached
 * (objectcacher) reads and synchronous OSD reads, retrying once with a
 * refreshed size when a short sync read may have raced a size change.
 *
 * @return bytes read (bl->length()) or a negative errno
 */
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // negative offset means "read at the shared fd position"; take the pos
  // lock so concurrent pos-relative I/O on this Fh stays serialized.
  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we have never learned the inline state;
  // fetch it before deciding how to read.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
        unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  // O_DIRECT bypasses the object cache even if we hold the cache cap
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // can't serve inline data without the cache cap: push the inline
      // data out to RADOS and fall through to a normal read below.
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve the read directly from the in-memory inline data,
      // zero-filling any gap between the inline length and EOF.
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // O_RSYNC: flush any dirty overlapping buffers before reading
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read near apparent EOF: drop caps, re-verify the size, and
      // retry if the file turned out to be longer than we thought.
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait (without client_lock) for the uninline mutation to land
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else already uninlined it; either way the
    // local inline copy is now stale.
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
8890
// Completion for a background readahead: pins the Fh and records one
// pending readahead for the lifetime of the callback object.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
8896
// Mirror of the constructor: un-count the pending readahead and drop the
// Fh reference taken there.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
8901
// Readahead I/O completed: release the cap references taken when the
// readahead was issued in _read_async().
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8906
/**
 * Cached read through the objectcacher, plus opportunistic readahead.
 *
 * The requested range is trimmed to the (locally known) file size.  If the
 * data is not fully cached, the call blocks (releasing client_lock) until
 * the cache fill completes.  Afterwards a readahead window computed by
 * f->readahead may be issued in the background via C_Readahead.
 *
 * @return bytes read or a negative errno
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    // cache miss: wait (without client_lock) for the fill to finish
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      // C_Readahead pins the Fh; caps are dropped in its finish()
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
8971
/**
 * Synchronous (uncached) read straight from the OSDs via the Filer.
 *
 * Loops issuing blocking reads until `len` bytes are gathered.  A short
 * OSD read inside the known file size is zero-filled (sparse object);
 * a short read at the apparent end of file sets *checkeof so the caller
 * can re-verify the size and retry.
 *
 * @return bytes placed in *bl, or a negative errno
 */
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    // block (without client_lock) until the OSD read completes
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // hit apparent EOF: let the caller re-check the size and retry
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9038
9039
/*
 * We keep a count of uncommitted sync writes on the inode, so that
 * fsync can wait for them all to commit ("do the right thing") before
 * returning.
 */
/**
 * Completion bookkeeping for one synchronous OSD write issued by _write():
 * decrement the global unsafe-write count, drop the FILE_BUFFER cap ref
 * taken when the write was submitted, and wake an unmount that is waiting
 * for the last unsafe write to drain.
 */
void Client::_sync_write_commit(Inode *in)
{
  assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9057
9058 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9059 {
9060 Mutex::Locker lock(client_lock);
9061 tout(cct) << "write" << std::endl;
9062 tout(cct) << fd << std::endl;
9063 tout(cct) << size << std::endl;
9064 tout(cct) << offset << std::endl;
9065
9066 if (unmounting)
9067 return -ENOTCONN;
9068
9069 Fh *fh = get_filehandle(fd);
9070 if (!fh)
9071 return -EBADF;
9072 #if defined(__linux__) && defined(O_PATH)
9073 if (fh->flags & O_PATH)
9074 return -EBADF;
9075 #endif
9076 int r = _write(fh, offset, size, buf, NULL, 0);
9077 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9078 return r;
9079 }
9080
9081 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9082 {
9083 if (iovcnt < 0)
9084 return -EINVAL;
9085 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9086 }
9087
/**
 * Shared implementation behind preadv()/pwritev().
 *
 * For writes, the iovec is passed straight through to _write().  For
 * reads, the total length is read into one bufferlist and then scattered
 * back across the iovec, handling the case where fewer bytes than
 * requested were read.
 *
 * @param write true for pwritev semantics, false for preadv
 * @return bytes transferred or a negative errno
 */
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors permit no I/O operations
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " <<  offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // scatter the (possibly short) read back into the caller's iovecs
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9137
9138 int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
9139 const struct iovec *iov, int iovcnt)
9140 {
9141 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
9142 return -EFBIG;
9143
9144 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9145 Inode *in = f->inode.get();
9146
9147 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
9148 return -ENOSPC;
9149 }
9150
9151 assert(in->snapid == CEPH_NOSNAP);
9152
9153 // was Fh opened as writeable?
9154 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9155 return -EBADF;
9156
9157 // check quota
9158 uint64_t endoff = offset + size;
9159 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
9160 f->actor_perms)) {
9161 return -EDQUOT;
9162 }
9163
9164 // use/adjust fd pos?
9165 if (offset < 0) {
9166 lock_fh_pos(f);
9167 /*
9168 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9169 * change out from under us.
9170 */
9171 if (f->flags & O_APPEND) {
9172 int r = _lseek(f, 0, SEEK_END);
9173 if (r < 0) {
9174 unlock_fh_pos(f);
9175 return r;
9176 }
9177 }
9178 offset = f->pos;
9179 f->pos = offset+size;
9180 unlock_fh_pos(f);
9181 }
9182
9183 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9184
9185 ldout(cct, 10) << "cur file size is " << in->size << dendl;
9186
9187 // time it.
9188 utime_t start = ceph_clock_now();
9189
9190 if (in->inline_version == 0) {
9191 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9192 if (r < 0)
9193 return r;
9194 assert(in->inline_version > 0);
9195 }
9196
9197 // copy into fresh buffer (since our write may be resub, async)
9198 bufferlist bl;
9199 if (buf) {
9200 if (size > 0)
9201 bl.append(buf, size);
9202 } else if (iov){
9203 for (int i = 0; i < iovcnt; i++) {
9204 if (iov[i].iov_len > 0) {
9205 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
9206 }
9207 }
9208 }
9209
9210 utime_t lat;
9211 uint64_t totalwritten;
9212 int have;
9213 int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
9214 CEPH_CAP_FILE_BUFFER, &have, endoff);
9215 if (r < 0)
9216 return r;
9217
9218 /* clear the setuid/setgid bits, if any */
9219 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
9220 struct ceph_statx stx = { 0 };
9221
9222 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9223 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
9224 if (r < 0)
9225 return r;
9226 } else {
9227 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9228 }
9229
9230 if (f->flags & O_DIRECT)
9231 have &= ~CEPH_CAP_FILE_BUFFER;
9232
9233 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
9234
9235 Mutex uninline_flock("Client::_write_uninline_data flock");
9236 Cond uninline_cond;
9237 bool uninline_done = false;
9238 int uninline_ret = 0;
9239 Context *onuninline = NULL;
9240
9241 if (in->inline_version < CEPH_INLINE_NONE) {
9242 if (endoff > cct->_conf->client_max_inline_size ||
9243 endoff > CEPH_INLINE_MAX_SIZE ||
9244 !(have & CEPH_CAP_FILE_BUFFER)) {
9245 onuninline = new C_SafeCond(&uninline_flock,
9246 &uninline_cond,
9247 &uninline_done,
9248 &uninline_ret);
9249 uninline_data(in, onuninline);
9250 } else {
9251 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9252
9253 uint32_t len = in->inline_data.length();
9254
9255 if (endoff < len)
9256 in->inline_data.copy(endoff, len - endoff, bl);
9257
9258 if (offset < len)
9259 in->inline_data.splice(offset, len - offset);
9260 else if (offset > len)
9261 in->inline_data.append_zero(offset - len);
9262
9263 in->inline_data.append(bl);
9264 in->inline_version++;
9265
9266 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9267
9268 goto success;
9269 }
9270 }
9271
9272 if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
9273 // do buffered write
9274 if (!in->oset.dirty_or_tx)
9275 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
9276
9277 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9278
9279 // async, caching, non-blocking.
9280 r = objectcacher->file_write(&in->oset, &in->layout,
9281 in->snaprealm->get_snap_context(),
9282 offset, size, bl, ceph::real_clock::now(),
9283 0);
9284 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9285
9286 if (r < 0)
9287 goto done;
9288
9289 // flush cached write if O_SYNC is set on file fh
9290 // O_DSYNC == O_SYNC on linux < 2.6.33
9291 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9292 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
9293 _flush_range(in, offset, size);
9294 }
9295 } else {
9296 if (f->flags & O_DIRECT)
9297 _flush_range(in, offset, size);
9298
9299 // simple, non-atomic sync write
9300 Mutex flock("Client::_write flock");
9301 Cond cond;
9302 bool done = false;
9303 Context *onfinish = new C_SafeCond(&flock, &cond, &done);
9304
9305 unsafe_sync_write++;
9306 get_cap_ref(in, CEPH_CAP_FILE_BUFFER); // released by onsafe callback
9307
9308 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
9309 offset, size, bl, ceph::real_clock::now(), 0,
9310 in->truncate_size, in->truncate_seq,
9311 onfinish);
9312 client_lock.Unlock();
9313 flock.Lock();
9314
9315 while (!done)
9316 cond.Wait(flock);
9317 flock.Unlock();
9318 client_lock.Lock();
9319 _sync_write_commit(in);
9320 }
9321
9322 // if we get here, write was successful, update client metadata
9323 success:
9324 // time
9325 lat = ceph_clock_now();
9326 lat -= start;
9327 logger->tinc(l_c_wrlat, lat);
9328
9329 totalwritten = size;
9330 r = (int)totalwritten;
9331
9332 // extend file?
9333 if (totalwritten + offset > in->size) {
9334 in->size = totalwritten + offset;
9335 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
9336
9337 if (is_quota_bytes_approaching(in, f->actor_perms)) {
9338 check_caps(in, CHECK_CAPS_NODELAY);
9339 } else if (is_max_size_approaching(in)) {
9340 check_caps(in, 0);
9341 }
9342
9343 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
9344 } else {
9345 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
9346 }
9347
9348 // mtime
9349 in->mtime = ceph_clock_now();
9350 in->change_attr++;
9351 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
9352
9353 done:
9354
9355 if (onuninline) {
9356 client_lock.Unlock();
9357 uninline_flock.Lock();
9358 while (!uninline_done)
9359 uninline_cond.Wait(uninline_flock);
9360 uninline_flock.Unlock();
9361 client_lock.Lock();
9362
9363 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
9364 in->inline_data.clear();
9365 in->inline_version = CEPH_INLINE_NONE;
9366 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
9367 check_caps(in, 0);
9368 } else
9369 r = uninline_ret;
9370 }
9371
9372 put_cap_ref(in, CEPH_CAP_FILE_WR);
9373 return r;
9374 }
9375
9376 int Client::_flush(Fh *f)
9377 {
9378 Inode *in = f->inode.get();
9379 int err = f->take_async_err();
9380 if (err != 0) {
9381 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9382 << cpp_strerror(err) << dendl;
9383 } else {
9384 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9385 }
9386
9387 return err;
9388 }
9389
/**
 * truncate(2): set the file at `relpath` to `length` bytes.
 * Only stx_size is initialized here; setattrx consults just the fields
 * selected by the CEPH_SETATTR_SIZE mask.
 */
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
9396
/**
 * ftruncate(2): set the file behind fd to `length` bytes via setattr.
 *
 * Only attr.st_size is initialized; _setattr consults just the fields
 * selected by the CEPH_SETATTR_SIZE mask.
 *
 * NOTE(review): POSIX requires the fd to be open for writing, but this
 * does not check f->mode for CEPH_FILE_MODE_WR, so truncation through a
 * read-only fd is currently permitted -- confirm whether that is intended.
 */
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "ftruncate" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors permit no I/O operations
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
9418
/**
 * fsync(2)/fdatasync(2): flush dirty data (and, unless syncdataonly,
 * metadata) for fd.  Any asynchronous write error recorded on the Fh is
 * surfaced here and then cleared.
 */
int Client::fsync(int fd, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors permit no I/O operations
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9454
/**
 * Flush an inode: write back dirty cached data via the objectcacher and,
 * unless syncdataonly, flush dirty caps (metadata) to the MDS and wait for
 * any unsafe (not-yet-durable) MDS requests on the inode to be committed.
 *
 * @return 0 on success, or the first error from the data writeback
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty metadata to the MDS and remember the flush tid to wait on
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // waiting on the most recent unsafe request implies all earlier ones
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9519
// Fh convenience wrapper: sync the handle's underlying inode.
int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
9525
9526 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9527 {
9528 Mutex::Locker lock(client_lock);
9529 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9530 tout(cct) << fd << std::endl;
9531
9532 if (unmounting)
9533 return -ENOTCONN;
9534
9535 Fh *f = get_filehandle(fd);
9536 if (!f)
9537 return -EBADF;
9538 int r = _getattr(f->inode, mask, perms);
9539 if (r < 0)
9540 return r;
9541 fill_stat(f->inode, stbuf, NULL);
9542 ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9543 return r;
9544 }
9545
/**
 * statx-style fstat: fill *stx with the fields requested in `want`,
 * honoring the sync/force semantics encoded in `flags`.
 *
 * A getattr round-trip to the MDS is only issued when the caps covering
 * the requested fields are not already held (or a sync is forced).
 */
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    // we don't hold caps for everything requested; ask the MDS
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9575
9576 // not written yet, but i want to link!
9577
/**
 * chdir(2): resolve `relpath`, make it the client's current directory,
 * and return the resulting absolute path in new_cwd.
 *
 * @return 0 on success or a negative errno from the path walk
 */
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  // swap keeps reference counting simple: the old cwd ref is dropped when
  // `in` goes out of scope
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ")  cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9600
/**
 * Build the absolute path of the current working directory by walking
 * parent dentries from cwd up to the mount root.
 *
 * If a parent link is missing from our cache, a LOOKUPNAME request is
 * sent to the MDS and the walk restarts from cwd.  If cwd or an ancestor
 * has been unlinked (dn_set empty), `dir` is left untouched.
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << "getcwd " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    assert(in->dn_set.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dn_set.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9640
9641 void Client::getcwd(string& dir, const UserPerm& perms)
9642 {
9643 Mutex::Locker l(client_lock);
9644 if (!unmounting)
9645 _getcwd(dir, perms);
9646 }
9647
/**
 * Fill *stbuf with filesystem statistics.  The 'path' argument is unused:
 * statistics are reported either cluster/pool-wide or, when a byte quota
 * applies to the mount's quota root, from that quota.
 *
 * @return 0 on success, negative errno on failure
 */
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;

  if (unmounting)
    return -ENOTCONN;

  ceph_statfs stats;
  C_SaferCond cond;

  // With a single data pool, report usage for just that pool; otherwise
  // fall back to whole-cluster statistics.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  // Drop client_lock while waiting for the objecter round trip so other
  // client activity is not stalled.
  client_lock.Unlock();
  int rval = cond.wait();
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks.  We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = stats.num_objects;
  stbuf->f_ffree = -1;  // -1 == "unknown" for fields we cannot report
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    // (kb values are converted from KB units to 4MB-block units.)
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9746
/**
 * Issue a file-lock operation (get/set, fcntl- or flock-style) to the MDS
 * and, on success of a SETFILELOCK, mirror the change into the
 * client-side lock bookkeeping.
 *
 * @param in        inode being locked
 * @param fh        file handle the lock belongs to
 * @param lock_type CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
 * @param op        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
 * @param sleep     nonzero to block until the lock can be granted
 * @param fl        POSIX flock descriptor (out-parameter for GETFILELOCK)
 * @param owner     lock owner id
 * @param removing  true when called from _release_filelocks(): skip the
 *                  per-handle state update because it is being torn down
 * @return 0 on success, negative errno on failure
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Translate the POSIX lock type into the ceph lock command.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // Only a SETFILELOCK that actually acquires a lock may block.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: hand an extra request reference (req->get()) to
    // the interrupt machinery so it can abort a blocked request
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);  // drop the extra reference taken above
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the lock the MDS reported and translate it back into the
      // caller's struct flock.
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the granted change into the per-inode lock table,
      // allocating it lazily on first use...
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // ...and into the per-handle table, unless that table is being
      // torn down by _release_filelocks().
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9857
// Interrupt a blocked file-lock request.  Marks 'req' aborted with -EINTR
// (so it will not be re-sent) and, if it already reached an MDS, sends a
// companion *_INTR unlock request so the MDS drops the pending waiter.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Pick the interrupt rule matching the original lock flavour.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // The interrupt request mirrors the original one, but with the *_INTR
  // rule and an unlock type.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
9890
9891 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9892 {
9893 if (!in->fcntl_locks && !in->flock_locks)
9894 return;
9895
9896 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9897 ::encode(nr_fcntl_locks, bl);
9898 if (nr_fcntl_locks) {
9899 ceph_lock_state_t* lock_state = in->fcntl_locks;
9900 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9901 p != lock_state->held_locks.end();
9902 ++p)
9903 ::encode(p->second, bl);
9904 }
9905
9906 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9907 ::encode(nr_flock_locks, bl);
9908 if (nr_flock_locks) {
9909 ceph_lock_state_t* lock_state = in->flock_locks;
9910 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9911 p != lock_state->held_locks.end();
9912 ++p)
9913 ::encode(p->second, bl);
9914 }
9915
9916 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
9917 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
9918 }
9919
9920 void Client::_release_filelocks(Fh *fh)
9921 {
9922 if (!fh->fcntl_locks && !fh->flock_locks)
9923 return;
9924
9925 Inode *in = fh->inode.get();
9926 ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
9927
9928 list<pair<int, ceph_filelock> > to_release;
9929
9930 if (fh->fcntl_locks) {
9931 ceph_lock_state_t* lock_state = fh->fcntl_locks;
9932 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9933 p != lock_state->held_locks.end();
9934 ++p)
9935 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
9936 delete fh->fcntl_locks;
9937 }
9938 if (fh->flock_locks) {
9939 ceph_lock_state_t* lock_state = fh->flock_locks;
9940 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9941 p != lock_state->held_locks.end();
9942 ++p)
9943 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
9944 delete fh->flock_locks;
9945 }
9946
9947 if (to_release.empty())
9948 return;
9949
9950 struct flock fl;
9951 memset(&fl, 0, sizeof(fl));
9952 fl.l_whence = SEEK_SET;
9953 fl.l_type = F_UNLCK;
9954
9955 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
9956 p != to_release.end();
9957 ++p) {
9958 fl.l_start = p->second.start;
9959 fl.l_len = p->second.length;
9960 fl.l_pid = p->second.pid;
9961 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
9962 p->second.owner, true);
9963 }
9964 }
9965
9966 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
9967 ceph_lock_state_t *lock_state)
9968 {
9969 int lock_cmd;
9970 if (F_RDLCK == fl->l_type)
9971 lock_cmd = CEPH_LOCK_SHARED;
9972 else if (F_WRLCK == fl->l_type)
9973 lock_cmd = CEPH_LOCK_EXCL;
9974 else
9975 lock_cmd = CEPH_LOCK_UNLOCK;;
9976
9977 ceph_filelock filelock;
9978 filelock.start = fl->l_start;
9979 filelock.length = fl->l_len;
9980 filelock.client = 0;
9981 // see comment in _do_filelock()
9982 filelock.owner = owner | (1ULL << 63);
9983 filelock.pid = fl->l_pid;
9984 filelock.type = lock_cmd;
9985
9986 if (filelock.type == CEPH_LOCK_UNLOCK) {
9987 list<ceph_filelock> activated_locks;
9988 lock_state->remove_lock(filelock, activated_locks);
9989 } else {
9990 bool r = lock_state->add_lock(filelock, false, false, NULL);
9991 assert(r);
9992 }
9993 }
9994
9995 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
9996 {
9997 Inode *in = fh->inode.get();
9998 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
9999 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10000 return ret;
10001 }
10002
10003 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10004 {
10005 Inode *in = fh->inode.get();
10006 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10007 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10008 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10009 return ret;
10010 }
10011
10012 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10013 {
10014 Inode *in = fh->inode.get();
10015 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10016
10017 int sleep = !(cmd & LOCK_NB);
10018 cmd &= ~LOCK_NB;
10019
10020 int type;
10021 switch (cmd) {
10022 case LOCK_SH:
10023 type = F_RDLCK;
10024 break;
10025 case LOCK_EX:
10026 type = F_WRLCK;
10027 break;
10028 case LOCK_UN:
10029 type = F_UNLCK;
10030 break;
10031 default:
10032 return -EINVAL;
10033 }
10034
10035 struct flock fl;
10036 memset(&fl, 0, sizeof(fl));
10037 fl.l_type = type;
10038 fl.l_whence = SEEK_SET;
10039
10040 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10041 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10042 return ret;
10043 }
10044
10045 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10046 {
10047 /* Since the only thing this does is wrap a call to statfs, and
10048 statfs takes a lock, it doesn't seem we have a need to split it
10049 out. */
10050 return statfs(0, stbuf, perms);
10051 }
10052
10053 void Client::ll_register_callbacks(struct client_callback_args *args)
10054 {
10055 if (!args)
10056 return;
10057 Mutex::Locker l(client_lock);
10058 ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
10059 << " invalidate_ino_cb " << args->ino_cb
10060 << " invalidate_dentry_cb " << args->dentry_cb
10061 << " getgroups_cb" << args->getgroups_cb
10062 << " switch_interrupt_cb " << args->switch_intr_cb
10063 << " remount_cb " << args->remount_cb
10064 << dendl;
10065 callback_handle = args->handle;
10066 if (args->ino_cb) {
10067 ino_invalidate_cb = args->ino_cb;
10068 async_ino_invalidator.start();
10069 }
10070 if (args->dentry_cb) {
10071 dentry_invalidate_cb = args->dentry_cb;
10072 async_dentry_invalidator.start();
10073 }
10074 if (args->switch_intr_cb) {
10075 switch_interrupt_cb = args->switch_intr_cb;
10076 interrupt_finisher.start();
10077 }
10078 if (args->remount_cb) {
10079 remount_cb = args->remount_cb;
10080 remount_finisher.start();
10081 }
10082 getgroups_cb = args->getgroups_cb;
10083 umask_cb = args->umask_cb;
10084 }
10085
// Decide how kernel dentry-cache invalidation will be handled: either via
// the registered dentry_invalidate_cb, or by remounting.  Returns 0 if a
// working method is available; on failure either aborts or just logs,
// depending on client_die_on_failed_dentry_invalidate.
// NOTE(review): if can_invalidate is false and no remount_cb is
// registered, this returns 0 without any invalidation method — confirm
// that silent success is intended.
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount();
  }
  if (r) {
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort) {
      lderr(cct) << "no method to invalidate kernel dentry cache; quitting!" << dendl;
      ceph_abort();
    } else {
      lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    }
  }
  return r;
}
10111
// Flush all dirty file data and caps and wait until both are stable.
// Caller holds client_lock; it is dropped and re-taken while waiting on
// the object-cacher flush.  Always returns 0.
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;  // no object cacher => no locally buffered data

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    // Wait for the data flush outside client_lock so writeback can make
    // progress; the flush completion signals 'cond' under 'lock'.
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
10146
10147 int Client::sync_fs()
10148 {
10149 Mutex::Locker l(client_lock);
10150
10151 if (unmounting)
10152 return -ENOTCONN;
10153
10154 return _sync_fs();
10155 }
10156
// Release everything the object cacher will let go of, returning
// objectcacher->release_all()'s count (presumably bytes released —
// confirm against ObjectCacher).
// NOTE(review): unlike most entry points this does not check
// 'unmounting' — verify callers cannot race with unmount.
int64_t Client::drop_caches()
{
  Mutex::Locker l(client_lock);
  return objectcacher->release_all();
}
10162
10163
// Propagate locally buffered writes for a lazy-io fd.  Currently
// implemented as a full _fsync of the file handle; offset/count are
// accepted but unused.
int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
{
  Mutex::Locker l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  // for now
  // NOTE(review): _fsync()'s return value is discarded, so flush errors
  // are not reported to the caller.
  _fsync(f, true);

  return 0;
}
10179
// Synchronize a lazy-io fd with the cluster: flush local data (_fsync),
// then release cached state (_release) and re-check caps so later reads
// fetch fresh content.  offset/count are accepted but unused.
int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  Mutex::Locker l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // NOTE(review): the _fsync() return value is discarded here as well.
  _fsync(f, true);
  if (_release(in))
    check_caps(in, 0);
  return 0;
}
10196
10197
10198 // =============================
10199 // snaps
10200
10201 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10202 {
10203 Mutex::Locker l(client_lock);
10204
10205 if (unmounting)
10206 return -ENOTCONN;
10207
10208 filepath path(relpath);
10209 InodeRef in;
10210 int r = path_walk(path, &in, perm);
10211 if (r < 0)
10212 return r;
10213 if (cct->_conf->client_permissions) {
10214 r = may_create(in.get(), perm);
10215 if (r < 0)
10216 return r;
10217 }
10218 Inode *snapdir = open_snapdir(in.get());
10219 return _mkdir(snapdir, name, 0, perm);
10220 }
10221
10222 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10223 {
10224 Mutex::Locker l(client_lock);
10225
10226 if (unmounting)
10227 return -ENOTCONN;
10228
10229 filepath path(relpath);
10230 InodeRef in;
10231 int r = path_walk(path, &in, perms);
10232 if (r < 0)
10233 return r;
10234 if (cct->_conf->client_permissions) {
10235 r = may_delete(in.get(), NULL, perms);
10236 if (r < 0)
10237 return r;
10238 }
10239 Inode *snapdir = open_snapdir(in.get());
10240 return _rmdir(snapdir, name, perms);
10241 }
10242
10243 // =============================
10244 // expose caps
10245
10246 int Client::get_caps_issued(int fd) {
10247
10248 Mutex::Locker lock(client_lock);
10249
10250 if (unmounting)
10251 return -ENOTCONN;
10252
10253 Fh *f = get_filehandle(fd);
10254 if (!f)
10255 return -EBADF;
10256
10257 return f->inode->caps_issued();
10258 }
10259
10260 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10261 {
10262 Mutex::Locker lock(client_lock);
10263
10264 if (unmounting)
10265 return -ENOTCONN;
10266
10267 filepath p(path);
10268 InodeRef in;
10269 int r = path_walk(p, &in, perms, true);
10270 if (r < 0)
10271 return r;
10272 return in->caps_issued();
10273 }
10274
10275 // =========================================
10276 // low level
10277
/**
 * Return the pseudo-inode representing diri's ".snap" directory,
 * creating and caching it (keyed by {diri->ino, CEPH_SNAPDIR} in
 * inode_map) on first use.  The snapdir mirrors the owner, mode,
 * timestamps and size of its parent directory.
 */
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // Mirror the parent directory's attributes.
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10309
/**
 * Low-level lookup of 'name' within directory 'parent'.
 * On success (returns 0): fills *attr, takes an ll reference on the
 * child (released later via ll_forget()) and stores it in *out.
 * On failure: attr->st_ino is 0 and *out is NULL (the InodeRef stayed
 * empty).
 */
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!cct->_conf->fuse_default_permissions) {
    // client-side permission check unless FUSE enforces permissions itself
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // pin for the caller

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10349
/**
 * Like ll_lookup(), but fills a ceph_statx restricted by want/flags.
 * On success takes an ll reference on the child and returns it in *out;
 * on failure stx->stx_ino and stx->stx_mask are zeroed and *out is NULL.
 */
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!cct->_conf->fuse_default_permissions) {
    // client-side permission check unless FUSE enforces permissions itself
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller; released via ll_forget()
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10390
/**
 * Resolve a path to an inode (low-level interface).
 * On success: fills *stx per want/flags, takes an ll reference and
 * stores the inode in *out, returning 0.  On failure: zeroes
 * stx_mask/stx_ino, sets *out to NULL and returns a negative errno.
 * AT_SYMLINK_NOFOLLOW in 'flags' keeps a terminal symlink unresolved.
 */
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << "ll_walk" << name << dendl;
  tout(cct) << "ll_walk" << std::endl;
  tout(cct) << name << std::endl;

  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller; released via ll_forget()
    *out = in.get();
    return 0;
  }
}
10423
// Take one low-level (ll) reference on an inode.  The first ll reference
// also pins the inode itself (get()) and, for a linked directory, pins
// its parent dentry so the path stays resolvable while the reference is
// held.  Balanced by _ll_put().
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10436
// Drop 'num' ll references from an inode.  When the ll refcount reaches
// zero, unpin the parent dentry (for directories) and release the inode
// reference taken by _ll_get().  Returns 0 when the count hit zero,
// otherwise the remaining ll refcount.
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10452
// Drop every outstanding ll reference on every inode (used during
// teardown).  The loop saves 'next' before calling _ll_put() because
// dropping the last reference may erase the current entry from
// inode_map, invalidating its iterator.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << "_ll_drop_pins" << dendl;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref)
      _ll_put(in, in->ll_ref);
  }
}
10467
/**
 * Handle a FUSE-style "forget": drop 'count' ll references on 'in'.
 * Returns true when the inode no longer holds any ll references — or
 * when the forget was ignored (client unmounting, or root inode).
 * A count larger than the current ll_ref is clamped with a warning.
 */
bool Client::ll_forget(Inode *in, int count)
{
  Mutex::Locker lock(client_lock);
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // Asked to drop more references than we hold: clamp and warn.
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10497
10498 bool Client::ll_put(Inode *in)
10499 {
10500 /* ll_forget already takes the lock */
10501 return ll_forget(in, 1);
10502 }
10503
10504 snapid_t Client::ll_get_snapid(Inode *in)
10505 {
10506 Mutex::Locker lock(client_lock);
10507 return in->snapid;
10508 }
10509
10510 Inode *Client::ll_get_inode(ino_t ino)
10511 {
10512 Mutex::Locker lock(client_lock);
10513
10514 if (unmounting)
10515 return NULL;
10516
10517 vinodeno_t vino = _map_faked_ino(ino);
10518 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10519 if (p == inode_map.end())
10520 return NULL;
10521 Inode *in = p->second;
10522 _ll_get(in);
10523 return in;
10524 }
10525
10526 Inode *Client::ll_get_inode(vinodeno_t vino)
10527 {
10528 Mutex::Locker lock(client_lock);
10529
10530 if (unmounting)
10531 return NULL;
10532
10533 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10534 if (p == inode_map.end())
10535 return NULL;
10536 Inode *in = p->second;
10537 _ll_get(in);
10538 return in;
10539 }
10540
10541 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10542 {
10543 vinodeno_t vino = _get_vino(in);
10544
10545 ldout(cct, 3) << "ll_getattr " << vino << dendl;
10546 tout(cct) << "ll_getattr" << std::endl;
10547 tout(cct) << vino.ino.val << std::endl;
10548
10549 if (vino.snapid < CEPH_NOSNAP)
10550 return 0;
10551 else
10552 return _getattr(in, caps, perms);
10553 }
10554
10555 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10556 {
10557 Mutex::Locker lock(client_lock);
10558
10559 if (unmounting)
10560 return -ENOTCONN;
10561
10562 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10563
10564 if (res == 0)
10565 fill_stat(in, attr);
10566 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10567 return res;
10568 }
10569
10570 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10571 unsigned int flags, const UserPerm& perms)
10572 {
10573 Mutex::Locker lock(client_lock);
10574
10575 if (unmounting)
10576 return -ENOTCONN;
10577
10578 int res = 0;
10579 unsigned mask = statx_to_mask(flags, want);
10580
10581 if (mask && !in->caps_issued_mask(mask, true))
10582 res = _ll_getattr(in, mask, perms);
10583
10584 if (res == 0)
10585 fill_statx(in, mask, stx);
10586 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10587 return res;
10588 }
10589
/**
 * Common implementation for the low-level setattr entry points: trace the
 * request, apply the optional client-side permission check, then perform
 * the attribute change via __setattrx().  On success *inp refers to the
 * updated inode.
 */
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << "ll_setattrx" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    // client-side permission check unless FUSE enforces permissions itself
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // NOTE(review): the *_NOW bits are stripped after the permission check
  // and before the actual setattr — presumably __setattrx derives "now"
  // timestamps itself; confirm.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
10618
10619 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10620 const UserPerm& perms)
10621 {
10622 Mutex::Locker lock(client_lock);
10623
10624 if (unmounting)
10625 return -ENOTCONN;
10626
10627 InodeRef target(in);
10628 int res = _ll_setattrx(in, stx, mask, perms, &target);
10629 if (res == 0) {
10630 assert(in == target.get());
10631 fill_statx(in, in->caps_issued(), stx);
10632 }
10633
10634 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10635 return res;
10636 }
10637
10638 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10639 const UserPerm& perms)
10640 {
10641 struct ceph_statx stx;
10642 stat_to_statx(attr, &stx);
10643
10644 Mutex::Locker lock(client_lock);
10645
10646 if (unmounting)
10647 return -ENOTCONN;
10648
10649 InodeRef target(in);
10650 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10651 if (res == 0) {
10652 assert(in == target.get());
10653 fill_stat(in, attr);
10654 }
10655
10656 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10657 return res;
10658 }
10659
10660
10661 // ----------
10662 // xattrs
10663
10664 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10665 const UserPerm& perms)
10666 {
10667 Mutex::Locker lock(client_lock);
10668
10669 if (unmounting)
10670 return -ENOTCONN;
10671
10672 InodeRef in;
10673 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10674 if (r < 0)
10675 return r;
10676 return _getxattr(in, name, value, size, perms);
10677 }
10678
10679 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10680 const UserPerm& perms)
10681 {
10682 Mutex::Locker lock(client_lock);
10683
10684 if (unmounting)
10685 return -ENOTCONN;
10686
10687 InodeRef in;
10688 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10689 if (r < 0)
10690 return r;
10691 return _getxattr(in, name, value, size, perms);
10692 }
10693
10694 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10695 const UserPerm& perms)
10696 {
10697 Mutex::Locker lock(client_lock);
10698
10699 if (unmounting)
10700 return -ENOTCONN;
10701
10702 Fh *f = get_filehandle(fd);
10703 if (!f)
10704 return -EBADF;
10705 return _getxattr(f->inode, name, value, size, perms);
10706 }
10707
10708 int Client::listxattr(const char *path, char *list, size_t size,
10709 const UserPerm& perms)
10710 {
10711 Mutex::Locker lock(client_lock);
10712
10713 if (unmounting)
10714 return -ENOTCONN;
10715
10716 InodeRef in;
10717 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10718 if (r < 0)
10719 return r;
10720 return Client::_listxattr(in.get(), list, size, perms);
10721 }
10722
10723 int Client::llistxattr(const char *path, char *list, size_t size,
10724 const UserPerm& perms)
10725 {
10726 Mutex::Locker lock(client_lock);
10727
10728 if (unmounting)
10729 return -ENOTCONN;
10730
10731 InodeRef in;
10732 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10733 if (r < 0)
10734 return r;
10735 return Client::_listxattr(in.get(), list, size, perms);
10736 }
10737
10738 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10739 {
10740 Mutex::Locker lock(client_lock);
10741
10742 if (unmounting)
10743 return -ENOTCONN;
10744
10745 Fh *f = get_filehandle(fd);
10746 if (!f)
10747 return -EBADF;
10748 return Client::_listxattr(f->inode.get(), list, size, perms);
10749 }
10750
10751 int Client::removexattr(const char *path, const char *name,
10752 const UserPerm& perms)
10753 {
10754 Mutex::Locker lock(client_lock);
10755
10756 if (unmounting)
10757 return -ENOTCONN;
10758
10759 InodeRef in;
10760 int r = Client::path_walk(path, &in, perms, true);
10761 if (r < 0)
10762 return r;
10763 return _removexattr(in, name, perms);
10764 }
10765
10766 int Client::lremovexattr(const char *path, const char *name,
10767 const UserPerm& perms)
10768 {
10769 Mutex::Locker lock(client_lock);
10770
10771 if (unmounting)
10772 return -ENOTCONN;
10773
10774 InodeRef in;
10775 int r = Client::path_walk(path, &in, perms, false);
10776 if (r < 0)
10777 return r;
10778 return _removexattr(in, name, perms);
10779 }
10780
10781 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10782 {
10783 Mutex::Locker lock(client_lock);
10784
10785 if (unmounting)
10786 return -ENOTCONN;
10787
10788 Fh *f = get_filehandle(fd);
10789 if (!f)
10790 return -EBADF;
10791 return _removexattr(f->inode, name, perms);
10792 }
10793
10794 int Client::setxattr(const char *path, const char *name, const void *value,
10795 size_t size, int flags, const UserPerm& perms)
10796 {
10797 _setxattr_maybe_wait_for_osdmap(name, value, size);
10798
10799 Mutex::Locker lock(client_lock);
10800
10801 if (unmounting)
10802 return -ENOTCONN;
10803
10804 InodeRef in;
10805 int r = Client::path_walk(path, &in, perms, true);
10806 if (r < 0)
10807 return r;
10808 return _setxattr(in, name, value, size, flags, perms);
10809 }
10810
10811 int Client::lsetxattr(const char *path, const char *name, const void *value,
10812 size_t size, int flags, const UserPerm& perms)
10813 {
10814 _setxattr_maybe_wait_for_osdmap(name, value, size);
10815
10816 Mutex::Locker lock(client_lock);
10817
10818 if (unmounting)
10819 return -ENOTCONN;
10820
10821 InodeRef in;
10822 int r = Client::path_walk(path, &in, perms, false);
10823 if (r < 0)
10824 return r;
10825 return _setxattr(in, name, value, size, flags, perms);
10826 }
10827
10828 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10829 int flags, const UserPerm& perms)
10830 {
10831 _setxattr_maybe_wait_for_osdmap(name, value, size);
10832
10833 Mutex::Locker lock(client_lock);
10834
10835 if (unmounting)
10836 return -ENOTCONN;
10837
10838 Fh *f = get_filehandle(fd);
10839 if (!f)
10840 return -EBADF;
10841 return _setxattr(f->inode, name, value, size, flags, perms);
10842 }
10843
10844 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10845 const UserPerm& perms)
10846 {
10847 int r;
10848
10849 const VXattr *vxattr = _match_vxattr(in, name);
10850 if (vxattr) {
10851 r = -ENODATA;
10852
10853 // Do a force getattr to get the latest quota before returning
10854 // a value to userspace.
10855 r = _getattr(in, 0, perms, true);
10856 if (r != 0) {
10857 // Error from getattr!
10858 return r;
10859 }
10860
10861 // call pointer-to-member function
10862 char buf[256];
10863 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10864 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10865 } else {
10866 r = -ENODATA;
10867 }
10868
10869 if (size != 0) {
10870 if (r > (int)size) {
10871 r = -ERANGE;
10872 } else if (r > 0) {
10873 memcpy(value, buf, r);
10874 }
10875 }
10876 goto out;
10877 }
10878
10879 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10880 r = -EOPNOTSUPP;
10881 goto out;
10882 }
10883
10884 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10885 if (r == 0) {
10886 string n(name);
10887 r = -ENODATA;
10888 if (in->xattrs.count(n)) {
10889 r = in->xattrs[n].length();
10890 if (r > 0 && size != 0) {
10891 if (size >= (unsigned)r)
10892 memcpy(value, in->xattrs[n].c_str(), r);
10893 else
10894 r = -ERANGE;
10895 }
10896 }
10897 }
10898 out:
10899 ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
10900 return r;
10901 }
10902
10903 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
10904 const UserPerm& perms)
10905 {
10906 if (cct->_conf->client_permissions) {
10907 int r = xattr_permission(in.get(), name, MAY_READ, perms);
10908 if (r < 0)
10909 return r;
10910 }
10911 return _getxattr(in.get(), name, value, size, perms);
10912 }
10913
// Low-level (libcephfs/FUSE) getxattr entry point: logs and traces the
// call, optionally performs the client-side permission check, then
// delegates to _getxattr.
int Client::ll_getxattr(Inode *in, const char *name, void *value,
			size_t size, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
  tout(cct) << "ll_getxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // When FUSE is not handling permission checks itself, enforce read
  // permission on the xattr here.
  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_READ, perms);
    if (r < 0)
      return r;
  }

  return _getxattr(in, name, value, size, perms);
}
10937
// Core listxattr implementation: writes the NUL-separated list of
// xattr names (real xattrs followed by visible virtual xattrs) into
// `name`, or just computes the required size when size == 0.
// Returns the total byte count on success, -ERANGE if the buffer is
// too small, or a negative errno from _getattr.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  // Refresh the xattr map from the MDS only if we have never seen it.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    // First pass: compute the total size needed (each name plus its
    // NUL terminator).
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    // Account for the non-hidden virtual xattrs for this inode type.
    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    // size == 0 is a pure size probe; otherwise fill the buffer.
    if (size != 0) {
      if (size >= (unsigned)r) {
	// Real xattr names first; `name` advances past each
	// NUL-terminated entry.
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	// Then the virtual xattrs that are not hidden and currently
	// exist on this inode.
	if (vxattrs) {
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	r = -ERANGE;
    }
  }
  ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
10982
// Low-level (libcephfs/FUSE) listxattr entry point: logs and traces
// the call, then delegates to _listxattr.  No permission check is
// performed here (listing names only).
int Client::ll_listxattr(Inode *in, char *names, size_t size,
			 const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
  tout(cct) << "ll_listxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  return _listxattr(in, names, size, perms);
}
11000
11001 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11002 size_t size, int flags, const UserPerm& perms)
11003 {
11004
11005 int xattr_flags = 0;
11006 if (!value)
11007 xattr_flags |= CEPH_XATTR_REMOVE;
11008 if (flags & XATTR_CREATE)
11009 xattr_flags |= CEPH_XATTR_CREATE;
11010 if (flags & XATTR_REPLACE)
11011 xattr_flags |= CEPH_XATTR_REPLACE;
11012
11013 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11014 filepath path;
11015 in->make_nosnap_relative_path(path);
11016 req->set_filepath(path);
11017 req->set_string2(name);
11018 req->set_inode(in);
11019 req->head.args.setxattr.flags = xattr_flags;
11020
11021 bufferlist bl;
11022 bl.append((const char*)value, size);
11023 req->set_data(bl);
11024
11025 int res = make_request(req, perms);
11026
11027 trim_cache();
11028 ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
11029 res << dendl;
11030 return res;
11031 }
11032
// Validate and dispatch a setxattr.  Rejects snapshots (-EROFS) and
// unsupported namespaces (-EOPNOTSUPP).  POSIX ACL xattrs get special
// handling: an access ACL may be folded into the file mode, and a
// default ACL is validated and restricted to directories.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // "system.*" names are only meaningful when POSIX ACLs are enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Same xattr namespaces the kernel client supports.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// posix_acl_equiv_mode returns 0 when the ACL is fully
	// representable as a plain mode, in which case the xattr
	// itself is dropped (value = NULL) and only the mode is set.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	// An empty/trivial default ACL is stored as a removal.
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    // Read-only virtual xattrs cannot be set.
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}
11093
11094 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11095 size_t size, int flags, const UserPerm& perms)
11096 {
11097 if (cct->_conf->client_permissions) {
11098 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11099 if (r < 0)
11100 return r;
11101 }
11102 return _setxattr(in.get(), name, value, size, flags, perms);
11103 }
11104
// Check whether a layout xattr value references a data pool known to
// the given osdmap.  `name` is the layout attribute suffix ("layout"
// or "layout.pool"); for the former, the "pool=" key is parsed out of
// the key/value string.  Returns 0 if no pool is named or the pool
// exists, -EINVAL on parse failure, -ENOENT if the pool is unknown.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    // The whole value must have been consumed by the parser.
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    // The pool may be given numerically or by name; try the numeric
    // interpretation first and fall back to a name lookup.
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11144
// If the xattr being set is a layout (pool) attribute and the named
// pool is not in our current osdmap, block until the latest osdmap is
// received.  Must be called WITHOUT client_lock held (it waits).
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // Strip the "ceph.file."/"ceph.dir." prefix; the checker only
    // cares about the "layout"/"layout.pool" suffix.
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    // Pool unknown locally: it may have just been created, so fetch
    // the newest osdmap and let the MDS make the final call.
    if (r == -ENOENT) {
      C_SaferCond ctx;
      objecter->wait_for_latest_osdmap(&ctx);
      ctx.wait();
    }
  }
}
11165
// Low-level (libcephfs/FUSE) setxattr entry point: waits for a fresh
// osdmap if needed (before taking client_lock), logs/traces, optionally
// enforces write permission, then delegates to _setxattr.
int Client::ll_setxattr(Inode *in, const char *name, const void *value,
			size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
  tout(cct) << "ll_setxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // When FUSE is not handling permission checks itself, enforce write
  // permission on the xattr here.
  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in, name, value, size, flags, perms);
}
11190
// Core removexattr implementation: validates the name, then issues a
// CEPH_MDS_OP_RMXATTR request to the MDS.  Rejects snapshots (-EROFS),
// unsupported namespaces and read-only virtual xattrs (-EOPNOTSUPP).
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  // Read-only virtual xattrs cannot be removed.
  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
11222
11223 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11224 {
11225 if (cct->_conf->client_permissions) {
11226 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11227 if (r < 0)
11228 return r;
11229 }
11230 return _removexattr(in.get(), name, perms);
11231 }
11232
// Low-level (libcephfs/FUSE) removexattr entry point: logs/traces,
// optionally enforces write permission, then delegates to _removexattr.
int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
  tout(cct) << "ll_removexattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // When FUSE is not handling permission checks itself, enforce write
  // permission on the xattr here.
  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }

  return _removexattr(in, name, perms);
}
11255
// Quota vxattrs exist only when a quota is actually configured on the
// inode.
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable();
}
// Format the combined "ceph.quota" value: both limits in one string.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
// Format "ceph.quota.max_bytes".
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
// Format "ceph.quota.max_files".
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
11275
// Layout vxattrs exist only when the inode's layout differs from the
// default-constructed layout.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
// Format the combined "ceph.{file,dir}.layout" value.  The pool is
// rendered by name when the osdmap knows it, by id otherwise.
// NOTE(review): if the formatted string exceeds `size`, snprintf's
// return value makes `val + r` point past the buffer and `size - r`
// underflow on the next call — presumably the 256-byte caller buffer
// is always large enough, but confirm for long pool/namespace names.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  // Append the pool namespace only when one is set.
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
// Format "ceph.*.layout.stripe_unit".
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
}
// Format "ceph.*.layout.stripe_count".
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
}
// Format "ceph.*.layout.object_size".
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
}
// Format "ceph.*.layout.pool": pool name when the osdmap knows the
// pool id, otherwise the numeric id.  `r` is always assigned because
// with_osdmap invokes the lambda unconditionally.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r = snprintf(val, size, "%s", o.get_pool_name(
	      in->layout.pool_id).c_str());
      else
	r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}
// Format "ceph.*.layout.pool_namespace".
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// Format "ceph.dir.entries": files plus subdirectories.
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
// Format "ceph.dir.files".
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
}
// Format "ceph.dir.subdirs".
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
}
// Format "ceph.dir.rentries": recursive files plus subdirectories.
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
// Format "ceph.dir.rfiles" (recursive file count).
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
}
// Format "ceph.dir.rsubdirs" (recursive subdirectory count).
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
}
// Format "ceph.dir.rbytes" (recursive byte count).
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
}
11356 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11357 {
11358 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11359 (long)in->rstat.rctime.nsec());
11360 }
11361
// Helpers for building virtual-xattr table entries.  These use the
// GNU designated-initializer-with-colon syntax (name: ...), matching
// the declaration order of Client::VXattr.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// A visible, read-only vxattr (e.g. directory statistics).
#define XATTR_NAME_CEPH(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  hidden: false, \
  exists_cb: NULL, \
}
// A hidden, writable per-field layout vxattr; only listed/served when
// the inode has a non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
  name: CEPH_XATTR_NAME2(_type, _name, _field), \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_layout_exists, \
}
// A hidden, writable per-field quota vxattr; only listed/served when a
// quota is enabled on the inode.
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_quota_exists, \
}
11389
// Virtual xattrs served for directory inodes.  The empty-name entry
// terminates the table (see _match_vxattr / _vxattrs_calcu_name_size).
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  XATTR_NAME_CEPH(dir, rentries),
  XATTR_NAME_CEPH(dir, rfiles),
  XATTR_NAME_CEPH(dir, rsubdirs),
  XATTR_NAME_CEPH(dir, rbytes),
  XATTR_NAME_CEPH(dir, rctime),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  { name: "" }     /* Required table terminator */
};
11422
// Virtual xattrs served for regular-file inodes.  The empty-name entry
// terminates the table.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" }     /* Required table terminator */
};
11438
11439 const Client::VXattr *Client::_get_vxattrs(Inode *in)
11440 {
11441 if (in->is_dir())
11442 return _dir_vxattrs;
11443 else if (in->is_file())
11444 return _file_vxattrs;
11445 return NULL;
11446 }
11447
11448 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11449 {
11450 if (strncmp(name, "ceph.", 5) == 0) {
11451 const VXattr *vxattr = _get_vxattrs(in);
11452 if (vxattr) {
11453 while (!vxattr->name.empty()) {
11454 if (vxattr->name == name)
11455 return vxattr;
11456 vxattr++;
11457 }
11458 }
11459 }
11460 return NULL;
11461 }
11462
11463 size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11464 {
11465 size_t len = 0;
11466 while (!vxattr->name.empty()) {
11467 if (!vxattr->hidden)
11468 len += vxattr->name.length() + 1;
11469 vxattr++;
11470 }
11471 return len;
11472 }
11473
// Low-level readlink: touches the dentries referring to the symlink
// (to keep them warm in the LRU) and copies the link target into buf.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // Refresh LRU position of every dentry that links to this inode.
  set<Dentry*>::iterator dn = in->dn_set.begin();
  while (dn != in->dn_set.end()) {
    touch_dn(*dn);
    ++dn;
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
11497
// Create a device/special file entry under `dir`.  Builds and sends a
// CEPH_MDS_OP_MKNOD request; on success *inp refers to the new inode.
// Errors: -ENAMETOOLONG, -EROFS for snapshots, -EDQUOT when the file
// quota is exceeded, or the MDS reply code.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also modify `mode`.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // make_request was never reached, so the request must be released
  // here.
  put_request(req);
  return res;
}
11551
// Low-level mknod: permission check, _mknod, then fill `attr` and take
// an ll reference on the new inode for the caller.
// NOTE(review): on failure `attr` is left untouched, so the
// attr->st_ino trace/log below reads caller-provided (possibly
// uninitialized) memory — confirm callers zero `attr` first.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  // When FUSE is not handling permission checks itself, verify the
  // caller may create entries in the parent directory.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // Pin the inode for the low-level API caller.
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11588
// statx-flavored low-level mknod: like ll_mknod but fills a
// ceph_statx with only the fields requested via want/flags.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  // When FUSE is not handling permission checks itself, verify the
  // caller may create entries in the parent directory.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    // Pin the inode for the low-level API caller.
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11627
// Create (and optionally open) a regular file under `dir`.
// Builds a CEPH_MDS_OP_CREATE request carrying the requested file
// layout (stripe/object size, data pool) and any inherited ACL
// xattrs.  On success *inp refers to the new inode, *created reports
// whether the MDS actually created it, and when fhp != NULL the file
// is opened and a new Fh is returned through it.
// Errors: -ENAMETOOLONG, -EROFS for snapshots, -EDQUOT on quota,
// -EINVAL for bad flags or an unknown data pool, -ERANGE for an
// over-large pool id, or the MDS reply code.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  // Resolve the requested data pool name to an id, if one was given.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also modify `mode`.
  mode |= S_IFREG;
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // make_request was never reached, so the request must be released
  // here.
  put_request(req);
  return res;
}
11720
11721
// Create a directory (or a snapshot when `dir` is the snapdir) under
// `dir`.  On success *inp refers to the new inode.
// Errors: -ENAMETOOLONG, -EROFS for non-snapdir snapshots, -EDQUOT on
// quota, or the MDS reply code.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // mkdir inside the snapdir creates a snapshot, which is allowed;
  // any other snapshot context is read-only.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also modify `mode`.
  mode |= S_IFDIR;
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // make_request was never reached, so the request must be released
  // here.
  put_request(req);
  return res;
}
11777
// Low-level mkdir: permission check, _mkdir, then fill `attr` and take
// an ll reference on the new inode for the caller.
// NOTE(review): on failure `attr` is left untouched (unlike ll_mkdirx,
// which zeroes stx), so the attr->st_ino trace below reads
// caller-provided memory — confirm callers zero `attr` first.
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  // When FUSE is not handling permission checks itself, verify the
  // caller may create entries in the parent directory.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // Pin the inode for the low-level API caller.
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11812
11813 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
11814 struct ceph_statx *stx, unsigned want, unsigned flags,
11815 const UserPerm& perms)
11816 {
11817 Mutex::Locker lock(client_lock);
11818
11819 if (unmounting)
11820 return -ENOTCONN;
11821
11822 vinodeno_t vparent = _get_vino(parent);
11823
11824 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
11825 tout(cct) << "ll_mkdirx" << std::endl;
11826 tout(cct) << vparent.ino.val << std::endl;
11827 tout(cct) << name << std::endl;
11828 tout(cct) << mode << std::endl;
11829
11830 if (!cct->_conf->fuse_default_permissions) {
11831 int r = may_create(parent, perms);
11832 if (r < 0)
11833 return r;
11834 }
11835
11836 InodeRef in;
11837 int r = _mkdir(parent, name, mode, perms, &in);
11838 if (r == 0) {
11839 fill_statx(in, statx_to_mask(flags, want), stx);
11840 _ll_get(in.get());
11841 } else {
11842 stx->stx_ino = 0;
11843 stx->stx_mask = 0;
11844 }
11845 tout(cct) << stx->stx_ino << std::endl;
11846 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
11847 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11848 *out = in.get();
11849 return r;
11850 }
11851
// Create symlink `name` -> `target` in directory `dir`.  On success *inp
// holds a reference to the new symlink inode.  Returns 0 or negative errno.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // symlinks can only be created in the live (non-snapshot) tree
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);  // the link target travels in string2
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  put_request(req);  // drop the request that was never submitted
  return res;
}
11897
11898 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
11899 struct stat *attr, Inode **out, const UserPerm& perms)
11900 {
11901 Mutex::Locker lock(client_lock);
11902
11903 if (unmounting)
11904 return -ENOTCONN;
11905
11906 vinodeno_t vparent = _get_vino(parent);
11907
11908 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
11909 << dendl;
11910 tout(cct) << "ll_symlink" << std::endl;
11911 tout(cct) << vparent.ino.val << std::endl;
11912 tout(cct) << name << std::endl;
11913 tout(cct) << value << std::endl;
11914
11915 if (!cct->_conf->fuse_default_permissions) {
11916 int r = may_create(parent, perms);
11917 if (r < 0)
11918 return r;
11919 }
11920
11921 InodeRef in;
11922 int r = _symlink(parent, name, value, perms, &in);
11923 if (r == 0) {
11924 fill_stat(in, attr);
11925 _ll_get(in.get());
11926 }
11927 tout(cct) << attr->st_ino << std::endl;
11928 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
11929 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11930 *out = in.get();
11931 return r;
11932 }
11933
11934 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
11935 Inode **out, struct ceph_statx *stx, unsigned want,
11936 unsigned flags, const UserPerm& perms)
11937 {
11938 Mutex::Locker lock(client_lock);
11939
11940 if (unmounting)
11941 return -ENOTCONN;
11942
11943 vinodeno_t vparent = _get_vino(parent);
11944
11945 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
11946 << dendl;
11947 tout(cct) << "ll_symlinkx" << std::endl;
11948 tout(cct) << vparent.ino.val << std::endl;
11949 tout(cct) << name << std::endl;
11950 tout(cct) << value << std::endl;
11951
11952 if (!cct->_conf->fuse_default_permissions) {
11953 int r = may_create(parent, perms);
11954 if (r < 0)
11955 return r;
11956 }
11957
11958 InodeRef in;
11959 int r = _symlink(parent, name, value, perms, &in);
11960 if (r == 0) {
11961 fill_statx(in, statx_to_mask(flags, want), stx);
11962 _ll_get(in.get());
11963 }
11964 tout(cct) << stx->stx_ino << std::endl;
11965 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
11966 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11967 *out = in.get();
11968 return r;
11969 }
11970
// Remove the link `name` from directory `dir`.  The victim inode's
// delegations are broken before the MDS request is issued.
// Returns 0 or a negative errno.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 3) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // snapshot trees are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // look up the victim so we can break its delegations and tell the MDS
  // which caps to drop on it
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);  // drop the request that was never submitted
  return res;
}
12020
12021 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12022 {
12023 Mutex::Locker lock(client_lock);
12024
12025 if (unmounting)
12026 return -ENOTCONN;
12027
12028 vinodeno_t vino = _get_vino(in);
12029
12030 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12031 tout(cct) << "ll_unlink" << std::endl;
12032 tout(cct) << vino.ino.val << std::endl;
12033 tout(cct) << name << std::endl;
12034
12035 if (!cct->_conf->fuse_default_permissions) {
12036 int r = may_delete(in, name, perm);
12037 if (r < 0)
12038 return r;
12039 }
12040 return _unlink(in, name, perm);
12041 }
12042
12043 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12044 {
12045 ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid "
12046 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12047
12048 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12049 return -EROFS;
12050 }
12051
12052 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12053 MetaRequest *req = new MetaRequest(op);
12054 filepath path;
12055 dir->make_nosnap_relative_path(path);
12056 path.push_dentry(name);
12057 req->set_filepath(path);
12058
12059 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12060 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12061 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12062
12063 InodeRef in;
12064
12065 Dentry *de;
12066 int res = get_or_create(dir, name, &de);
12067 if (res < 0)
12068 goto fail;
12069 if (op == CEPH_MDS_OP_RMDIR)
12070 req->set_dentry(de);
12071 else
12072 de->get();
12073
12074 res = _lookup(dir, name, 0, &in, perms);
12075 if (res < 0)
12076 goto fail;
12077 if (op == CEPH_MDS_OP_RMDIR) {
12078 req->set_inode(dir);
12079 req->set_other_inode(in.get());
12080 } else {
12081 unlink(de, true, true);
12082 de->put();
12083 req->set_other_inode(in.get());
12084 }
12085
12086 res = make_request(req, perms);
12087
12088 trim_cache();
12089 ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl;
12090 return res;
12091
12092 fail:
12093 put_request(req);
12094 return res;
12095 }
12096
12097 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12098 {
12099 Mutex::Locker lock(client_lock);
12100
12101 if (unmounting)
12102 return -ENOTCONN;
12103
12104 vinodeno_t vino = _get_vino(in);
12105
12106 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12107 tout(cct) << "ll_rmdir" << std::endl;
12108 tout(cct) << vino.ino.val << std::endl;
12109 tout(cct) << name << std::endl;
12110
12111 if (!cct->_conf->fuse_default_permissions) {
12112 int r = may_delete(in, name, perms);
12113 if (r < 0)
12114 return r;
12115 }
12116
12117 return _rmdir(in, name, perms);
12118 }
12119
// Rename `fromdir`/`fromname` to `todir`/`toname`.  Renames inside the
// virtual .snap directory become RENAMESNAP operations.  Cross-snapshot
// and cross-quota-root renames are rejected with -EXDEV.
// Returns 0 or a negative errno.
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // only renaming a snapshot within the same .snap dir is allowed;
    // everything else in snapshot space is read-only
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  if (fromdir != todir) {
    // disallow renames that cross quota roots; quota accounting cannot
    // follow the moved subtree
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    // the source inode must exist; break its delegations before the move
    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // the destination may or may not exist: if it does it will be
    // replaced (break its delegations too); ENOENT is fine; any other
    // lookup error aborts the rename
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);  // drop the request that was never submitted
  return res;
}
12224
12225 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12226 const char *newname, const UserPerm& perm)
12227 {
12228 Mutex::Locker lock(client_lock);
12229
12230 if (unmounting)
12231 return -ENOTCONN;
12232
12233 vinodeno_t vparent = _get_vino(parent);
12234 vinodeno_t vnewparent = _get_vino(newparent);
12235
12236 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12237 << vnewparent << " " << newname << dendl;
12238 tout(cct) << "ll_rename" << std::endl;
12239 tout(cct) << vparent.ino.val << std::endl;
12240 tout(cct) << name << std::endl;
12241 tout(cct) << vnewparent.ino.val << std::endl;
12242 tout(cct) << newname << std::endl;
12243
12244 if (!cct->_conf->fuse_default_permissions) {
12245 int r = may_delete(parent, name, perm);
12246 if (r < 0)
12247 return r;
12248 r = may_delete(newparent, newname, perm);
12249 if (r < 0 && r != -ENOENT)
12250 return r;
12251 }
12252
12253 return _rename(parent, name, newparent, newname, perm);
12254 }
12255
// Create a hard link `dir`/`newname` pointing at existing inode `in`.
// On success *inp holds a reference to the linked inode.
// Returns 0 or a negative errno.
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // both the target and the directory must be in the live tree
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  // new name in filepath, existing inode in filepath2
  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);  // drop the request that was never submitted
  return res;
}
12300
12301 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12302 const UserPerm& perm)
12303 {
12304 Mutex::Locker lock(client_lock);
12305
12306 if (unmounting)
12307 return -ENOTCONN;
12308
12309 vinodeno_t vino = _get_vino(in);
12310 vinodeno_t vnewparent = _get_vino(newparent);
12311
12312 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12313 newname << dendl;
12314 tout(cct) << "ll_link" << std::endl;
12315 tout(cct) << vino.ino.val << std::endl;
12316 tout(cct) << vnewparent << std::endl;
12317 tout(cct) << newname << std::endl;
12318
12319 int r = 0;
12320 InodeRef target;
12321
12322 if (!cct->_conf->fuse_default_permissions) {
12323 if (S_ISDIR(in->mode))
12324 return -EPERM;
12325
12326 r = may_hardlink(in, perm);
12327 if (r < 0)
12328 return r;
12329
12330 r = may_create(newparent, perm);
12331 if (r < 0)
12332 return r;
12333 }
12334
12335 return _link(in, newparent, newname, perm, &target);
12336 }
12337
12338 int Client::ll_num_osds(void)
12339 {
12340 Mutex::Locker lock(client_lock);
12341 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12342 }
12343
12344 int Client::ll_osdaddr(int osd, uint32_t *addr)
12345 {
12346 Mutex::Locker lock(client_lock);
12347
12348 entity_addr_t g;
12349 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12350 if (!o.exists(osd))
12351 return false;
12352 g = o.get_addr(osd);
12353 return true;
12354 });
12355 if (!exists)
12356 return -1;
12357 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12358 *addr = ntohl(nb_addr);
12359 return 0;
12360 }
12361
12362 uint32_t Client::ll_stripe_unit(Inode *in)
12363 {
12364 Mutex::Locker lock(client_lock);
12365 return in->layout.stripe_unit;
12366 }
12367
12368 uint64_t Client::ll_snap_seq(Inode *in)
12369 {
12370 Mutex::Locker lock(client_lock);
12371 return in->snaprealm->seq;
12372 }
12373
12374 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12375 {
12376 Mutex::Locker lock(client_lock);
12377 *layout = in->layout;
12378 return 0;
12379 }
12380
12381 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12382 {
12383 return ll_file_layout(fh->inode.get(), layout);
12384 }
12385
12386 /* Currently we cannot take advantage of redundancy in reads, since we
12387 would have to go through all possible placement groups (a
12388 potentially quite large number determined by a hash), and use CRUSH
12389 to calculate the appropriate set of OSDs for each placement group,
12390 then index into that. An array with one entry per OSD is much more
12391 tractable and works for demonstration purposes. */
12392
12393 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12394 file_layout_t* layout)
12395 {
12396 Mutex::Locker lock(client_lock);
12397
12398 inodeno_t ino = ll_get_inodeno(in);
12399 uint32_t object_size = layout->object_size;
12400 uint32_t su = layout->stripe_unit;
12401 uint32_t stripe_count = layout->stripe_count;
12402 uint64_t stripes_per_object = object_size / su;
12403
12404 uint64_t stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12405 uint64_t stripepos = blockno % stripe_count; // which object in the object set (X)
12406 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12407 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12408
12409 object_t oid = file_object_t(ino, objectno);
12410 return objecter->with_osdmap([&](const OSDMap& o) {
12411 ceph_object_layout olayout =
12412 o.file_to_object_layout(oid, *layout);
12413 pg_t pg = (pg_t)olayout.ol_pgid;
12414 vector<int> osds;
12415 int primary;
12416 o.pg_to_acting_osds(pg, &osds, &primary);
12417 return primary;
12418 });
12419 }
12420
12421 /* Return the offset of the block, internal to the object */
12422
12423 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12424 {
12425 Mutex::Locker lock(client_lock);
12426 file_layout_t *layout=&(in->layout);
12427 uint32_t object_size = layout->object_size;
12428 uint32_t su = layout->stripe_unit;
12429 uint64_t stripes_per_object = object_size / su;
12430
12431 return (blockno % stripes_per_object) * su;
12432 }
12433
12434 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12435 const UserPerm& perms)
12436 {
12437 Mutex::Locker lock(client_lock);
12438
12439 if (unmounting)
12440 return -ENOTCONN;
12441
12442 vinodeno_t vino = _get_vino(in);
12443
12444 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12445 tout(cct) << "ll_opendir" << std::endl;
12446 tout(cct) << vino.ino.val << std::endl;
12447
12448 if (!cct->_conf->fuse_default_permissions) {
12449 int r = may_open(in, flags, perms);
12450 if (r < 0)
12451 return r;
12452 }
12453
12454 int r = _opendir(in, dirpp, perms);
12455 tout(cct) << (unsigned long)*dirpp << std::endl;
12456
12457 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12458 << dendl;
12459 return r;
12460 }
12461
12462 int Client::ll_releasedir(dir_result_t *dirp)
12463 {
12464 Mutex::Locker lock(client_lock);
12465 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12466 tout(cct) << "ll_releasedir" << std::endl;
12467 tout(cct) << (unsigned long)dirp << std::endl;
12468
12469 if (unmounting)
12470 return -ENOTCONN;
12471
12472 _closedir(dirp);
12473 return 0;
12474 }
12475
12476 int Client::ll_fsyncdir(dir_result_t *dirp)
12477 {
12478 Mutex::Locker lock(client_lock);
12479 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12480 tout(cct) << "ll_fsyncdir" << std::endl;
12481 tout(cct) << (unsigned long)dirp << std::endl;
12482
12483 if (unmounting)
12484 return -ENOTCONN;
12485
12486 return _fsync(dirp->inode.get(), false);
12487 }
12488
// Low-level open of an existing inode.  O_CREAT is not supported here
// (creation goes through ll_create/_ll_create).  On success *fhp (if
// non-NULL) receives the new file handle.  Returns 0 or negative errno.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  assert(!(flags & O_CREAT));

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!cct->_conf->fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

  out:
  // remember handles handed to the ll interface (see ll_unclosed_fh_set)
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
12524
// Shared implementation behind ll_create/ll_createx: look up `name`
// under `parent`, honoring O_CREAT/O_EXCL, creating and/or opening the
// file as needed.  On success *in refs the inode and *fhp (always
// non-NULL here) may hold an open handle.  Returns 0 or negative errno.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing name fails outright
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create may also open the file and fill *fhp
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // existing file: check open permission, then open if _create didn't
    if (!cct->_conf->fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // remember handles handed to the ll interface (see ll_unclosed_fh_set)
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
12606
12607 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12608 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12609 const UserPerm& perms)
12610 {
12611 Mutex::Locker lock(client_lock);
12612 InodeRef in;
12613
12614 if (unmounting)
12615 return -ENOTCONN;
12616
12617 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12618 fhp, perms);
12619 if (r >= 0) {
12620 assert(in);
12621
12622 // passing an Inode in outp requires an additional ref
12623 if (outp) {
12624 _ll_get(in.get());
12625 *outp = in.get();
12626 }
12627 fill_stat(in, attr);
12628 } else {
12629 attr->st_ino = 0;
12630 }
12631
12632 return r;
12633 }
12634
12635 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12636 int oflags, Inode **outp, Fh **fhp,
12637 struct ceph_statx *stx, unsigned want, unsigned lflags,
12638 const UserPerm& perms)
12639 {
12640 unsigned caps = statx_to_mask(lflags, want);
12641 Mutex::Locker lock(client_lock);
12642 InodeRef in;
12643
12644 if (unmounting)
12645 return -ENOTCONN;
12646
12647 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12648 if (r >= 0) {
12649 assert(in);
12650
12651 // passing an Inode in outp requires an additional ref
12652 if (outp) {
12653 _ll_get(in.get());
12654 *outp = in.get();
12655 }
12656 fill_statx(in, caps, stx);
12657 } else {
12658 stx->stx_ino = 0;
12659 stx->stx_mask = 0;
12660 }
12661
12662 return r;
12663 }
12664
12665 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12666 {
12667 Mutex::Locker lock(client_lock);
12668 tout(cct) << "ll_lseek" << std::endl;
12669 tout(cct) << offset << std::endl;
12670 tout(cct) << whence << std::endl;
12671
12672 if (unmounting)
12673 return -ENOTCONN;
12674
12675 return _lseek(fh, offset, whence);
12676 }
12677
12678 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12679 {
12680 Mutex::Locker lock(client_lock);
12681 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12682 tout(cct) << "ll_read" << std::endl;
12683 tout(cct) << (unsigned long)fh << std::endl;
12684 tout(cct) << off << std::endl;
12685 tout(cct) << len << std::endl;
12686
12687 if (unmounting)
12688 return -ENOTCONN;
12689
12690 return _read(fh, off, len, bl);
12691 }
12692
// Read `length` bytes at `offset` from RADOS object `blockid` of `in`
// directly via the objecter, bypassing the page/cap machinery.  `buf`
// must have room for `length` bytes (assumed, not checked — the actual
// copy is bounded by the bytes the OSD returned).  Returns bytes read
// or a negative errno.
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // drop client_lock while blocking on the OSD round-trip, then retake
  // it before returning (Locker above unlocks on destruction)
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    // copy whatever the OSD returned into the caller's buffer
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
12729
12730 /* It appears that the OSD doesn't return success unless the entire
12731 buffer was written, return the write length on success. */
12732
// Write `length` bytes from `buf` to RADOS object `blockid` of `in`
// directly via the objecter.  On success returns `length` (the OSD only
// reports success for a complete write); otherwise a negative errno.
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  Mutex flock("Client::ll_write_block flock");
  vinodeno_t vino = ll_get_vino(in);
  Cond cond;
  bool done;
  int r = 0;
  Context *onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  // NOTE: `true ||` deliberately forces the synchronous path; the
  // unstable/barrier branch below is currently disabled.
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe = new C_SafeCond(&flock, &cond, &done, &r);
    done = false;
  } else {
    /* if write is unstable, we just place a barrier for
     * future commits to wait on */
    /*onsafe = new C_Block_Sync(this, vino.ino,
			       barrier_interval(offset, offset + length), &r);
    */
    done = true;
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  bufferptr bp;
  if (length > 0) bp = buffer::copy(buf, length);
  bufferlist bl;
  bl.push_back(bp);

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  // write against the caller-supplied snap sequence
  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();
  if (unmounting) {
    client_lock.Unlock();
    delete onsafe;  // never handed to the objecter; avoid leaking it
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe);

  // wait for the commit callback outside client_lock, on our private flock
  client_lock.Unlock();
  if (!done /* also !sync */) {
    flock.Lock();
    while (! done)
      cond.Wait(flock);
    flock.Unlock();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
12805
12806 int Client::ll_commit_blocks(Inode *in,
12807 uint64_t offset,
12808 uint64_t length)
12809 {
12810 Mutex::Locker lock(client_lock);
12811 /*
12812 BarrierContext *bctx;
12813 vinodeno_t vino = _get_vino(in);
12814 uint64_t ino = vino.ino;
12815
12816 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
12817 << offset << " to " << length << dendl;
12818
12819 if (length == 0) {
12820 return -EINVAL;
12821 }
12822
12823 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
12824 if (p != barriers.end()) {
12825 barrier_interval civ(offset, offset + length);
12826 p->second->commit_barrier(civ);
12827 }
12828 */
12829 return 0;
12830 }
12831
12832 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12833 {
12834 Mutex::Locker lock(client_lock);
12835 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12836 "~" << len << dendl;
12837 tout(cct) << "ll_write" << std::endl;
12838 tout(cct) << (unsigned long)fh << std::endl;
12839 tout(cct) << off << std::endl;
12840 tout(cct) << len << std::endl;
12841
12842 if (unmounting)
12843 return -ENOTCONN;
12844
12845 int r = _write(fh, off, len, data, NULL, 0);
12846 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
12847 << dendl;
12848 return r;
12849 }
12850
12851 int Client::ll_flush(Fh *fh)
12852 {
12853 Mutex::Locker lock(client_lock);
12854 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
12855 tout(cct) << "ll_flush" << std::endl;
12856 tout(cct) << (unsigned long)fh << std::endl;
12857
12858 if (unmounting)
12859 return -ENOTCONN;
12860
12861 return _flush(fh);
12862 }
12863
12864 int Client::ll_fsync(Fh *fh, bool syncdataonly)
12865 {
12866 Mutex::Locker lock(client_lock);
12867 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
12868 tout(cct) << "ll_fsync" << std::endl;
12869 tout(cct) << (unsigned long)fh << std::endl;
12870
12871 if (unmounting)
12872 return -ENOTCONN;
12873
12874 int r = _fsync(fh, syncdataonly);
12875 if (r) {
12876 // If we're returning an error, clear it from the FH
12877 fh->take_async_err();
12878 }
12879 return r;
12880 }
12881
#ifdef FALLOC_FL_PUNCH_HOLE

/*
 * fallocate(2)-style space manipulation on an open file.
 *
 * Supported modes: 0 (allocate/extend), FALLOC_FL_KEEP_SIZE, and
 * FALLOC_FL_PUNCH_HOLE (which, as on Linux, must be combined with
 * KEEP_SIZE).  Returns 0 on success or a negative errno.
 * Called with client_lock held; may drop it while waiting on OSD ops.
 */
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  // Reject mode bits we do not implement.
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // Punching a hole must not change the file size (Linux semantics).
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocation needs new space; refuse on a full pool.  Punching a hole
  // frees space, so it is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // A plain allocation that grows the file counts against the byte quota.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  // State for waiting on an uninline_data() completion, if one is started.
  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
	(have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and buffer cap held: rewrite the inline blob locally,
      // zero-filling the punched range.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
	if (offset > 0)
	  in->inline_data.copy(0, offset, bl);
	int size = length;
	if (offset + size > len)
	  size = len - offset;
	if (size > 0)
	  bl.append_zero(size);
	if (offset + size < len)
	  in->inline_data.copy(offset + size, len - offset - size, bl);
	in->inline_data = bl;
	in->inline_version++;
      }
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
    } else {
      // Data lives (or must live) in RADOS: uninline first if needed, then
      // issue a zero op to the OSDs for the punched range.
      if (in->inline_version < CEPH_INLINE_NONE) {
	onuninline = new C_SafeCond(&uninline_flock,
				    &uninline_cond,
				    &uninline_done,
				    &uninline_ret);
	uninline_data(in, onuninline);
      }

      Mutex flock("Client::_punch_hole flock");
      Cond cond;
      bool done = false;
      Context *onfinish = new C_SafeCond(&flock, &cond, &done);

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, onfinish);
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      // Drop the client lock while waiting for the OSD zero to complete.
      client_lock.Unlock();
      flock.Lock();
      while (!done)
	cond.Wait(flock);
      flock.Unlock();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation: just extend the (sparse) file size if needed.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
	check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
	check_caps(in, 0);
      }
    }
  }

  if (onuninline) {
    // Wait for the uninline operation started above (lock dropped again).
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

// Platform without FALLOC_FL_PUNCH_HOLE (e.g. non-Linux): unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
13026
13027
13028 int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
13029 {
13030 Mutex::Locker lock(client_lock);
13031 ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
13032 tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
13033 tout(cct) << (unsigned long)fh << std::endl;
13034
13035 if (unmounting)
13036 return -ENOTCONN;
13037
13038 return _fallocate(fh, mode, offset, length);
13039 }
13040
13041 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13042 {
13043 Mutex::Locker lock(client_lock);
13044 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
13045
13046 if (unmounting)
13047 return -ENOTCONN;
13048
13049 Fh *fh = get_filehandle(fd);
13050 if (!fh)
13051 return -EBADF;
13052 #if defined(__linux__) && defined(O_PATH)
13053 if (fh->flags & O_PATH)
13054 return -EBADF;
13055 #endif
13056 return _fallocate(fh, mode, offset, length);
13057 }
13058
13059 int Client::ll_release(Fh *fh)
13060 {
13061 Mutex::Locker lock(client_lock);
13062 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
13063 dendl;
13064 tout(cct) << "ll_release (fh)" << std::endl;
13065 tout(cct) << (unsigned long)fh << std::endl;
13066
13067 if (unmounting)
13068 return -ENOTCONN;
13069
13070 if (ll_unclosed_fh_set.count(fh))
13071 ll_unclosed_fh_set.erase(fh);
13072 return _release_fh(fh);
13073 }
13074
13075 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13076 {
13077 Mutex::Locker lock(client_lock);
13078
13079 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13080 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13081
13082 if (unmounting)
13083 return -ENOTCONN;
13084
13085 return _getlk(fh, fl, owner);
13086 }
13087
13088 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13089 {
13090 Mutex::Locker lock(client_lock);
13091
13092 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
13093 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13094
13095 if (unmounting)
13096 return -ENOTCONN;
13097
13098 return _setlk(fh, fl, owner, sleep);
13099 }
13100
13101 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13102 {
13103 Mutex::Locker lock(client_lock);
13104
13105 ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
13106 tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
13107
13108 if (unmounting)
13109 return -ENOTCONN;
13110
13111 return _flock(fh, cmd, owner);
13112 }
13113
13114 int Client::set_deleg_timeout(uint32_t timeout)
13115 {
13116 Mutex::Locker lock(client_lock);
13117
13118 /*
13119 * The whole point is to prevent blacklisting so we must time out the
13120 * delegation before the session autoclose timeout kicks in.
13121 */
13122 if (timeout >= mdsmap->get_session_autoclose())
13123 return -EINVAL;
13124
13125 deleg_timeout = timeout;
13126 return 0;
13127 }
13128
13129 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13130 {
13131 int ret = -EINVAL;
13132
13133 Mutex::Locker lock(client_lock);
13134
13135 if (!mounted)
13136 return -ENOTCONN;
13137
13138 Inode *inode = fh->inode.get();
13139
13140 switch(cmd) {
13141 case CEPH_DELEGATION_NONE:
13142 inode->unset_deleg(fh);
13143 ret = 0;
13144 break;
13145 default:
13146 try {
13147 ret = inode->set_deleg(fh, cmd, cb, priv);
13148 } catch (std::bad_alloc) {
13149 ret = -ENOMEM;
13150 }
13151 break;
13152 }
13153 return ret;
13154 }
13155
// Completion context queued on the interrupt finisher: re-takes the client
// lock and cancels an in-flight SETFILELOCK request.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // hold a reference until finish() runs
  }
  void finish(int r) override {
    Mutex::Locker l(client->client_lock);
    // Only filelock requests are interruptible this way.
    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the constructor
  }
};
13171
// Request cancellation of an in-flight filelock request.  Called without
// client_lock held; the work is handed to the interrupt finisher thread,
// whose context takes the lock itself.
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
  tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13179
13180 // =========================================
13181 // layout
13182
13183 // expose file layouts
13184
13185 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13186 const UserPerm& perms)
13187 {
13188 Mutex::Locker lock(client_lock);
13189
13190 if (unmounting)
13191 return -ENOTCONN;
13192
13193 filepath path(relpath);
13194 InodeRef in;
13195 int r = path_walk(path, &in, perms);
13196 if (r < 0)
13197 return r;
13198
13199 *lp = in->layout;
13200
13201 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
13202 return 0;
13203 }
13204
13205 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13206 {
13207 Mutex::Locker lock(client_lock);
13208
13209 if (unmounting)
13210 return -ENOTCONN;
13211
13212 Fh *f = get_filehandle(fd);
13213 if (!f)
13214 return -EBADF;
13215 Inode *in = f->inode.get();
13216
13217 *lp = in->layout;
13218
13219 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
13220 return 0;
13221 }
13222
13223 int64_t Client::get_default_pool_id()
13224 {
13225 Mutex::Locker lock(client_lock);
13226
13227 if (unmounting)
13228 return -ENOTCONN;
13229
13230 /* first data pool is the default */
13231 return mdsmap->get_first_data_pool();
13232 }
13233
13234 // expose osdmap
13235
13236 int64_t Client::get_pool_id(const char *pool_name)
13237 {
13238 Mutex::Locker lock(client_lock);
13239
13240 if (unmounting)
13241 return -ENOTCONN;
13242
13243 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13244 pool_name);
13245 }
13246
13247 string Client::get_pool_name(int64_t pool)
13248 {
13249 Mutex::Locker lock(client_lock);
13250
13251 if (unmounting)
13252 return string();
13253
13254 return objecter->with_osdmap([pool](const OSDMap& o) {
13255 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13256 });
13257 }
13258
13259 int Client::get_pool_replication(int64_t pool)
13260 {
13261 Mutex::Locker lock(client_lock);
13262
13263 if (unmounting)
13264 return -ENOTCONN;
13265
13266 return objecter->with_osdmap([pool](const OSDMap& o) {
13267 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13268 });
13269 }
13270
// For file offset 'off' of fd, return the acting OSDs of the object holding
// that byte and (optionally, via *len) the remaining length of its stripe
// unit.  Returns 0, -ENOTCONN, -EBADF, or -EINVAL when no OSDs are acting.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map exactly one byte so exactly one extent comes back (see below).
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13316
13317 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13318 {
13319 Mutex::Locker lock(client_lock);
13320
13321 if (unmounting)
13322 return -ENOTCONN;
13323
13324 if (id < 0)
13325 return -EINVAL;
13326 return objecter->with_osdmap([&](const OSDMap& o) {
13327 return o.crush->get_full_location_ordered(id, path);
13328 });
13329 }
13330
// Append the addresses of the acting OSDs for the object that backs the
// given file offset of fd.  Returns 0, -ENOTCONN, -EBADF, or -EINVAL when
// no OSDs are acting for the placement group.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?  (length 1 guarantees a single extent)
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addr(osds[i]);
	address.push_back(addr);
      }
      return 0;
    });
}
13364
13365 int Client::get_osd_addr(int osd, entity_addr_t& addr)
13366 {
13367 Mutex::Locker lock(client_lock);
13368
13369 if (unmounting)
13370 return -ENOTCONN;
13371
13372 return objecter->with_osdmap([&](const OSDMap& o) {
13373 if (!o.exists(osd))
13374 return -ENOENT;
13375
13376 addr = o.get_addr(osd);
13377 return 0;
13378 });
13379 }
13380
13381 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13382 loff_t length, loff_t offset)
13383 {
13384 Mutex::Locker lock(client_lock);
13385
13386 if (unmounting)
13387 return -ENOTCONN;
13388
13389 Fh *f = get_filehandle(fd);
13390 if (!f)
13391 return -EBADF;
13392 Inode *in = f->inode.get();
13393
13394 // map to a list of extents
13395 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13396
13397 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13398 return 0;
13399 }
13400
13401
13402 /* find an osd with the same ip. -ENXIO if none. */
int Client::get_local_osd()
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // Cache the answer per osdmap epoch; only recompute when the map changed.
  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddr());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
13418
13419
13420
13421
13422
13423
13424 // ===============================
13425
// Messenger callback: a connection we initiated is now established.
// Nothing to do beyond logging.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
}
13430
// Messenger callback: our side of a connection was reset.  We return false
// (no special handling here).
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
  return false;
}
13436
// Messenger callback: the peer reset an established connection.  For MDS
// peers, advance the matching session's state machine accordingly.
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
  Mutex::Locker l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p) {
	if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
	  mds = p->first;
	  s = p->second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // We asked to close anyway; treat the reset as the close.
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // Preserve any open-waiters across the close/reopen cycle.
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    // Either force a reconnect by closing the session, or simply
	    // mark it stale, depending on configuration.
	    const md_config_t *conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}
13497
// Messenger callback: the peer actively refused our connection.  We return
// false (no special handling here).
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
  return false;
}
13503
13504 bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
13505 {
13506 if (dest_type == CEPH_ENTITY_TYPE_MON)
13507 return true;
13508 *authorizer = monclient->build_authorizer(dest_type);
13509 return true;
13510 }
13511
// Walk up from 'in' to the nearest ancestor (excluding 'in' itself) with a
// quota enabled, or to root_ancestor.  Parents are found through cached
// dentries when a valid lease or matching shared-cap generation allows it;
// otherwise a LOOKUPNAME request is sent to the MDS.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    // Stop at the first quota-enabled ancestor above the starting inode.
    if (cur != in && cur->quota.is_enable())
      break;

    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
	Dentry *dn = *p;
	if (dn->lease_mds >= 0 &&
	    dn->lease_ttl > now &&
	    mds_sessions.count(dn->lease_mds)) {
	  // Valid dentry lease from a live session: trust the cached parent.
	  parent_in = dn->dir->parent_inode;
	} else {
	  // No lease: accept the cached parent only while the directory's
	  // shared cap generation still matches the dentry's.
	  Inode *diri = dn->dir->parent_inode;
	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	      diri->shared_gen == dn->cap_shared_gen) {
	    parent_in = dn->dir->parent_inode;
	  }
	}
	if (parent_in)
	  break;
      }
    } else if (root_parents.count(cur)) {
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    if (cur == root_ancestor)
      break;

    // deleted inode
    if (cur->nlink == 0) {
      cur = root_ancestor;
      break;
    }

    // Cache miss: ask the MDS for this inode's parent.
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
		    << " failed to find parent of " << cur->vino()
		    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    // make_request may have dropped the lock; refresh the lease clock and
    // restart the walk (from the looked-up parent if we were at the start).
    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
13583
13584 /**
13585 * Traverse quota ancestors of the Inode, return true
13586 * if any of them passes the passed function
13587 */
13588 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13589 std::function<bool (const Inode &in)> test)
13590 {
13591 while (true) {
13592 assert(in != NULL);
13593 if (test(*in)) {
13594 return true;
13595 }
13596
13597 if (in == root_ancestor) {
13598 // We're done traversing, drop out
13599 return false;
13600 } else {
13601 // Continue up the tree
13602 in = get_quota_root(in, perms);
13603 }
13604 }
13605
13606 return false;
13607 }
13608
13609 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13610 {
13611 return check_quota_condition(in, perms,
13612 [](const Inode &in) {
13613 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13614 });
13615 }
13616
13617 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
13618 const UserPerm& perms)
13619 {
13620 return check_quota_condition(in, perms,
13621 [&new_bytes](const Inode &in) {
13622 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13623 > in.quota.max_bytes;
13624 });
13625 }
13626
13627 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
13628 {
13629 return check_quota_condition(in, perms,
13630 [](const Inode &in) {
13631 if (in.quota.max_bytes) {
13632 if (in.rstat.rbytes >= in.quota.max_bytes) {
13633 return true;
13634 }
13635
13636 assert(in.size >= in.reported_size);
13637 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
13638 const uint64_t size = in.size - in.reported_size;
13639 return (space >> 4) < size;
13640 } else {
13641 return false;
13642 }
13643 });
13644 }
13645
// Flag bits cached in pool_perms (see check_pool_perm()).
enum {
  POOL_CHECKED = 1,   // a permission check has completed for this pool
  POOL_CHECKING = 2,  // a check is in flight; other callers wait on it
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
13652
// Verify (and cache per pool/namespace) that our OSD caps permit the
// read/write access implied by 'need' on the inode's data pool.  Probes the
// pool with a stat and an exclusive create on the file's first object;
// drops client_lock while waiting.  Returns 0, -EPERM, or -EIO when the
// probe result is indeterminate.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the check in flight so concurrent callers wait on us (above).
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Probe read access with a stat on the first object...
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // ...and write access with an exclusive create (-EEXIST still counts).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Wait for both probes without holding the client lock.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13755
13756 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13757 {
13758 if (acl_type == POSIX_ACL) {
13759 if (in->xattrs.count(ACL_EA_ACCESS)) {
13760 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13761
13762 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13763 }
13764 }
13765 return -EAGAIN;
13766 }
13767
// Keep the inode's access ACL consistent with a new mode (chmod path).
// Returns 0 on success (including "no ACL to update") or a negative errno.
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Ensure xattrs are up to date before inspecting the ACL.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a copy; posix_acl_access_chmod updates it in place.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      // No access ACL present: nothing to rewrite.
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
13793
// Compute the effective mode and initial ACL xattrs for a new inode created
// under 'dir'.  If the directory has a default ACL it is inherited
// (directories also receive the default ACL itself); otherwise the umask
// callback, if set, is applied to *mode.  On success r is the number of
// xattrs encoded into xattrs_bl (0 if none); negative errno on failure.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  // Ensure the directory's xattrs are current before reading its ACLs.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // Work on a copy; posix_acl_inherit_mode updates acl and *mode.
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// Inherited ACL present: keep it as the access ACL unless it is
	// fully representable by the mode bits (per posix_acl_equiv_mode).
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // New directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask via the callback instead.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
13841
13842 void Client::set_filer_flags(int flags)
13843 {
13844 Mutex::Locker l(client_lock);
13845 assert(flags == 0 ||
13846 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13847 objecter->add_global_op_flags(flags);
13848 }
13849
13850 void Client::clear_filer_flags(int flags)
13851 {
13852 Mutex::Locker l(client_lock);
13853 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13854 objecter->clear_global_op_flag(flags);
13855 }
13856
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 *
 * @param e OSD map epoch to record as the cap release barrier.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
13869
13870 const char** Client::get_tracked_conf_keys() const
13871 {
13872 static const char* keys[] = {
13873 "client_cache_size",
13874 "client_cache_mid",
13875 "client_acl_type",
13876 "client_deleg_timeout",
13877 "client_deleg_break_on_open",
13878 NULL
13879 };
13880 return keys;
13881 }
13882
13883 void Client::handle_conf_change(const struct md_config_t *conf,
13884 const std::set <std::string> &changed)
13885 {
13886 Mutex::Locker lock(client_lock);
13887
13888 if (changed.count("client_cache_mid")) {
13889 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
13890 }
13891 if (changed.count("client_acl_type")) {
13892 acl_type = NO_ACL;
13893 if (cct->_conf->client_acl_type == "posix_acl")
13894 acl_type = POSIX_ACL;
13895 }
13896 }
13897
13898 void Client::init_groups(UserPerm *perms)
13899 {
13900 gid_t *sgids;
13901 int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
13902 perms->init_gids(sgids, count);
13903 }
13904
// intrusive_ptr support: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
13909
// intrusive_ptr support: drop a reference via the owning client, which may
// free the inode when the count reaches zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
13914
13915 mds_rank_t Client::_get_random_up_mds() const
13916 {
13917 assert(client_lock.is_locked_by_me());
13918
13919 std::set<mds_rank_t> up;
13920 mdsmap->get_up_mds_set(up);
13921
13922 if (up.empty())
13923 return MDS_RANK_NONE;
13924 std::set<mds_rank_t>::const_iterator p = up.begin();
13925 for (int n = rand() % up.size(); n; n--)
13926 ++p;
13927 return *p;
13928 }
13929
13930
// A standalone client owns its own Objecter (freed in the destructor),
// unlike embedded users that share one.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
13937
StandaloneClient::~StandaloneClient()
{
  // The objecter was allocated by our constructor; release it here.
  delete objecter;
  objecter = nullptr;
}
13943
// Bring up a standalone client: timer, object cacher, objecter, messenger
// dispatchers, and the monitor session.  Returns 0 or the monclient init
// error, in which case the partially-started pieces are torn down again.
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
13974
// Shut down in reverse dependency order: client state first, then the
// objecter it relied on, then the monitor session.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}