]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
6b34e4a330e43df4983fc48be64fd78045675a49
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <sys/stat.h>
22 #include <sys/param.h>
23 #include <fcntl.h>
24 #include <sys/file.h>
25 #include <sys/utsname.h>
26 #include <sys/uio.h>
27
28 #include <boost/lexical_cast.hpp>
29 #include <boost/fusion/include/std_pair.hpp>
30
31 #if defined(__FreeBSD__)
32 #define XATTR_CREATE 0x1
33 #define XATTR_REPLACE 0x2
34 #else
35 #include <sys/xattr.h>
36 #endif
37
38 #if defined(__linux__)
39 #include <linux/falloc.h>
40 #endif
41
42 #include <sys/statvfs.h>
43
44 #include "common/config.h"
45 #include "common/version.h"
46
47 // ceph stuff
48 #include "messages/MClientSession.h"
49 #include "messages/MClientReconnect.h"
50 #include "messages/MClientRequest.h"
51 #include "messages/MClientRequestForward.h"
52 #include "messages/MClientReply.h"
53 #include "messages/MClientCaps.h"
54 #include "messages/MClientLease.h"
55 #include "messages/MClientSnap.h"
56 #include "messages/MCommandReply.h"
57 #include "messages/MOSDMap.h"
58 #include "messages/MClientQuota.h"
59 #include "messages/MClientCapRelease.h"
60 #include "messages/MMDSMap.h"
61 #include "messages/MFSMap.h"
62 #include "messages/MFSMapUser.h"
63
64 #include "mon/MonClient.h"
65
66 #include "mds/flock.h"
67 #include "osd/OSDMap.h"
68 #include "osdc/Filer.h"
69
70 #include "common/Cond.h"
71 #include "common/Mutex.h"
72 #include "common/perf_counters.h"
73 #include "common/admin_socket.h"
74 #include "common/errno.h"
75 #include "include/str_list.h"
76
77 #define dout_subsys ceph_subsys_client
78
79 #include "include/lru.h"
80 #include "include/compat.h"
81 #include "include/stringify.h"
82
83 #include "Client.h"
84 #include "Inode.h"
85 #include "Dentry.h"
86 #include "Dir.h"
87 #include "ClientSnapRealm.h"
88 #include "Fh.h"
89 #include "MetaSession.h"
90 #include "MetaRequest.h"
91 #include "ObjecterWriteback.h"
92 #include "posix_acl.h"
93
94 #include "include/assert.h"
95 #include "include/stat.h"
96
97 #include "include/cephfs/ceph_statx.h"
98
99 #if HAVE_GETGROUPLIST
100 #include <grp.h>
101 #include <pwd.h>
102 #include <unistd.h>
103 #endif
104
105 #undef dout_prefix
106 #define dout_prefix *_dout << "client." << whoami << " "
107
108 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
109
110 // FreeBSD fails to define this
111 #ifndef O_DSYNC
112 #define O_DSYNC 0x0
113 #endif
114 // Darwin fails to define this
115 #ifndef O_RSYNC
116 #define O_RSYNC 0x0
117 #endif
118
119 #ifndef O_DIRECT
120 #define O_DIRECT 0x0
121 #endif
122
123 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
124
125 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
126 {
127 Client *client = static_cast<Client*>(p);
128 client->flush_set_callback(oset);
129 }
130
131
132 // -------------
133
134 Client::CommandHook::CommandHook(Client *client) :
135 m_client(client)
136 {
137 }
138
139 bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
140 std::string format, bufferlist& out)
141 {
142 Formatter *f = Formatter::create(format);
143 f->open_object_section("result");
144 m_client->client_lock.Lock();
145 if (command == "mds_requests")
146 m_client->dump_mds_requests(f);
147 else if (command == "mds_sessions")
148 m_client->dump_mds_sessions(f);
149 else if (command == "dump_cache")
150 m_client->dump_cache(f);
151 else if (command == "kick_stale_sessions")
152 m_client->_kick_stale_sessions();
153 else if (command == "status")
154 m_client->dump_status(f);
155 else
156 assert(0 == "bad command registered");
157 m_client->client_lock.Unlock();
158 f->close_section();
159 f->flush(out);
160 delete f;
161 return true;
162 }
163
164
165 // -------------
166
// Initialize a directory read cursor.  next_offset starts at 2 because
// offsets 0 and 1 are reserved for the synthetic "." and ".." entries;
// the cached-readdir validation counters start cleared.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
172
173 void Client::_reset_faked_inos()
174 {
175 ino_t start = 1024;
176 free_faked_inos.clear();
177 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
178 last_used_faked_ino = 0;
179 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
180 }
181
// Allocate the next free faked ino for `in`, scanning the free-interval
// set from just past the last allocation and wrapping to the start of
// the number space when the tail is exhausted.
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // nothing free above the cursor; wrap around and search again
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // the pool can never be fully exhausted in practice
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // cursor fell in a gap: jump to the start of the next free interval
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // cursor is inside this interval: take the next value in it
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  // record the allocation and the faked->real vino mapping
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
201
202 void Client::_release_faked_ino(Inode *in)
203 {
204 free_faked_inos.insert(in->faked_ino);
205 faked_ino_map.erase(in->faked_ino);
206 }
207
208 vinodeno_t Client::_map_faked_ino(ino_t ino)
209 {
210 vinodeno_t vino;
211 if (ino == 1)
212 vino = root->vino();
213 else if (faked_ino_map.count(ino))
214 vino = faked_ino_map[ino];
215 else
216 vino = vinodeno_t(0, CEPH_NOSNAP);
217 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
218 return vino;
219 }
220
221 vinodeno_t Client::map_faked_ino(ino_t ino)
222 {
223 Mutex::Locker lock(client_lock);
224 return _map_faked_ino(ino);
225 }
226
227 // cons/des
228
// Construct a client bound to the given messenger, monitor client and
// objecter.  Only in-memory setup happens here; threads/timers are
// started later by init().  Note the objecter_finisher IS started here
// because the ObjectCacher writeback handler needs it immediately.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    getgroups_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    require_remount(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-1), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock")
{
  _reset_faked_inos();
  // no root inode until the first mount/lookup populates it
  root = 0;

  num_flushing_caps = 0;

  // precompute listxattr buffer sizes for the virtual xattrs
  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_max(cct->_conf->client_cache_size);
  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles; fds below 10 are reserved
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback,    // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
298
299
// Destructor: callers must have already released client_lock (and
// normally have called shutdown()).
Client::~Client()
{
  assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
311
// Forcibly drop all cached state: open file handles, open directories,
// the dentry lru, and finally the root inode tree.  Caller holds
// client_lock.  Order matters: fds and dirs pin inodes, so they must
// go before the lru/root teardown can fully drain the inode map.
void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  lru.lru_set_max(0);
  trim_cache();
  assert(lru.lru_get_size() == 0);

  // close root ino
  assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  assert(inode_map.empty());
}
352
353 inodeno_t Client::get_root_ino()
354 {
355 Mutex::Locker l(client_lock);
356 if (use_faked_inos())
357 return root->faked_ino;
358 else
359 return root->ino;
360 }
361
362 Inode *Client::get_root()
363 {
364 Mutex::Locker l(client_lock);
365 root->ll_get();
366 return root;
367 }
368
369
370 // debug crapola
371
372 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
373 {
374 filepath path;
375 in->make_long_path(path);
376 ldout(cct, 1) << "dump_inode: "
377 << (disconnected ? "DISCONNECTED ":"")
378 << "inode " << in->ino
379 << " " << path
380 << " ref " << in->get_num_ref()
381 << *in << dendl;
382
383 if (f) {
384 f->open_object_section("inode");
385 f->dump_stream("path") << path;
386 if (disconnected)
387 f->dump_int("disconnected", 1);
388 in->dump(f);
389 f->close_section();
390 }
391
392 did.insert(in);
393 if (in->dir) {
394 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
395 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
396 it != in->dir->dentries.end();
397 ++it) {
398 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
399 if (f) {
400 f->open_object_section("dentry");
401 it->second->dump(f);
402 f->close_section();
403 }
404 if (it->second->inode)
405 dump_inode(f, it->second->inode.get(), did, false);
406 }
407 }
408 }
409
410 void Client::dump_cache(Formatter *f)
411 {
412 set<Inode*> did;
413
414 ldout(cct, 1) << "dump_cache" << dendl;
415
416 if (f)
417 f->open_array_section("cache");
418
419 if (root)
420 dump_inode(f, root, did, true);
421
422 // make a second pass to catch anything disconnected
423 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
424 it != inode_map.end();
425 ++it) {
426 if (did.count(it->second))
427 continue;
428 dump_inode(f, it->second, did, true);
429 }
430
431 if (f)
432 f->close_section();
433 }
434
435 void Client::dump_status(Formatter *f)
436 {
437 assert(client_lock.is_locked_by_me());
438
439 ldout(cct, 1) << __func__ << dendl;
440
441 const epoch_t osd_epoch
442 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
443
444 if (f) {
445 f->open_object_section("metadata");
446 for (const auto& kv : metadata)
447 f->dump_string(kv.first.c_str(), kv.second);
448 f->close_section();
449
450 f->dump_int("dentry_count", lru.lru_get_size());
451 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
452 f->dump_int("id", get_nodeid().v);
453 f->dump_int("inode_count", inode_map.size());
454 f->dump_int("mds_epoch", mdsmap->get_epoch());
455 f->dump_int("osd_epoch", osd_epoch);
456 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
457 }
458 }
459
460 int Client::init()
461 {
462 timer.init();
463 objectcacher->start();
464
465 client_lock.Lock();
466 assert(!initialized);
467
468 messenger->add_dispatcher_tail(this);
469 client_lock.Unlock();
470
471 _finish_init();
472 return 0;
473 }
474
475 void Client::_finish_init()
476 {
477 client_lock.Lock();
478 // logger
479 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
480 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
481 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
482 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
483 logger.reset(plb.create_perf_counters());
484 cct->get_perfcounters_collection()->add(logger.get());
485
486 client_lock.Unlock();
487
488 cct->_conf->add_observer(this);
489
490 AdminSocket* admin_socket = cct->get_admin_socket();
491 int ret = admin_socket->register_command("mds_requests",
492 "mds_requests",
493 &m_command_hook,
494 "show in-progress mds requests");
495 if (ret < 0) {
496 lderr(cct) << "error registering admin socket command: "
497 << cpp_strerror(-ret) << dendl;
498 }
499 ret = admin_socket->register_command("mds_sessions",
500 "mds_sessions",
501 &m_command_hook,
502 "show mds session state");
503 if (ret < 0) {
504 lderr(cct) << "error registering admin socket command: "
505 << cpp_strerror(-ret) << dendl;
506 }
507 ret = admin_socket->register_command("dump_cache",
508 "dump_cache",
509 &m_command_hook,
510 "show in-memory metadata cache contents");
511 if (ret < 0) {
512 lderr(cct) << "error registering admin socket command: "
513 << cpp_strerror(-ret) << dendl;
514 }
515 ret = admin_socket->register_command("kick_stale_sessions",
516 "kick_stale_sessions",
517 &m_command_hook,
518 "kick sessions that were remote reset");
519 if (ret < 0) {
520 lderr(cct) << "error registering admin socket command: "
521 << cpp_strerror(-ret) << dendl;
522 }
523 ret = admin_socket->register_command("status",
524 "status",
525 &m_command_hook,
526 "show overall client status");
527 if (ret < 0) {
528 lderr(cct) << "error registering admin socket command: "
529 << cpp_strerror(-ret) << dendl;
530 }
531
532 client_lock.Lock();
533 initialized = true;
534 client_lock.Unlock();
535 }
536
// Orderly teardown: close MDS sessions, unregister observers and admin
// commands, drain the callback finishers, stop the object cacher, then
// the timer and objecter finisher, and finally remove perf counters.
// Each finisher is only stopped if its callback was ever registered.
void Client::shutdown()
{
  ldout(cct, 1) << "shutdown" << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf->remove_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  admin_socket->unregister_command("mds_requests");
  admin_socket->unregister_command("mds_sessions");
  admin_socket->unregister_command("dump_cache");
  admin_socket->unregister_command("kick_stale_sessions");
  admin_socket->unregister_command("status");

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
596
597
598 // ===================
599 // metadata cache stuff
600
601 void Client::trim_cache(bool trim_kernel_dcache)
602 {
603 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << lru.lru_get_max() << dendl;
604 unsigned last = 0;
605 while (lru.lru_get_size() != last) {
606 last = lru.lru_get_size();
607
608 if (lru.lru_get_size() <= lru.lru_get_max()) break;
609
610 // trim!
611 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
612 if (!dn)
613 break; // done
614
615 trim_dentry(dn);
616 }
617
618 if (trim_kernel_dcache && lru.lru_get_size() > lru.lru_get_max())
619 _invalidate_kernel_dcache();
620
621 // hose root?
622 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
623 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
624 delete root;
625 root = 0;
626 root_ancestor = 0;
627 while (!root_parents.empty())
628 root_parents.erase(root_parents.begin());
629 inode_map.clear();
630 _reset_faked_inos();
631 }
632 }
633
634 void Client::trim_cache_for_reconnect(MetaSession *s)
635 {
636 mds_rank_t mds = s->mds_num;
637 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
638
639 int trimmed = 0;
640 list<Dentry*> skipped;
641 while (lru.lru_get_size() > 0) {
642 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
643 if (!dn)
644 break;
645
646 if ((dn->inode && dn->inode->caps.count(mds)) ||
647 dn->dir->parent_inode->caps.count(mds)) {
648 trim_dentry(dn);
649 trimmed++;
650 } else
651 skipped.push_back(dn);
652 }
653
654 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
655 lru.lru_insert_mid(*p);
656
657 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
658 << " trimmed " << trimmed << " dentries" << dendl;
659
660 if (s->caps.size() > 0)
661 _invalidate_kernel_dcache();
662 }
663
664 void Client::trim_dentry(Dentry *dn)
665 {
666 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
667 << " in dir " << hex << dn->dir->parent_inode->ino
668 << dendl;
669 if (dn->inode) {
670 Inode *diri = dn->dir->parent_inode;
671 diri->dir_release_count++;
672 clear_dir_complete_and_ordered(diri, true);
673 }
674 unlink(dn, false, false); // drop dir, drop dentry
675 }
676
677
// Apply mds-supplied file metadata (size, truncation state, times,
// inline data) to an inode, arbitrated by the caps we hold:
//  - size/truncation updates are ordered by truncate_seq;
//  - mtime/atime updates are ordered by time_warp_seq when we hold
//    caps that make our local times authoritative;
//  - change_attr and inline data simply take the newer version.
// `issued` is the union of caps issued, implemented and dirty.
void Client::update_inode_file_bits(Inode *in,
				    uint64_t truncate_seq, uint64_t truncate_size,
				    uint64_t size, uint64_t change_attr,
				    uint64_t time_warp_seq, utime_t ctime,
				    utime_t mtime,
				    utime_t atime,
				    version_t inline_version,
				    bufferlist& inline_data,
				    int issued)
{
  bool warn = false;
  ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
	   << " mtime " << mtime << dendl;
  ldout(cct, 25) << "truncate_seq: mds " << truncate_seq <<  " local "
	   << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
	   << " local " << in->time_warp_seq << dendl;
  uint64_t prior_size = in->size;

  if (inline_version > in->inline_version) {
    in->inline_data = inline_data;
    in->inline_version = inline_version;
  }

  /* always take a newer change attr */
  if (change_attr > in->change_attr)
    in->change_attr = change_attr;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    // mds has newer truncation state (or a grown file at the same seq)
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
	       << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      // NOTE(review): the invalidation range mixes truncate_size and
      // prior_size while the guard compares against `size` — confirm
      // this is intentional for the truncate-then-write case.
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
	       << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }

  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    // our caps make local times authoritative; only accept mds times
    // that are provably newer
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
	       << " is higher than local time_warp_seq "
	       << in->time_warp_seq << dendl;
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    // without those caps the mds values win unless its seq went backwards
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
	    << time_warp_seq << " is lower than local time_warp_seq "
	    << in->time_warp_seq
	    << dendl;
  }
}
783
784 void Client::_fragmap_remove_non_leaves(Inode *in)
785 {
786 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
787 if (!in->dirfragtree.is_leaf(p->first))
788 in->fragmap.erase(p++);
789 else
790 ++p;
791 }
792
793 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
794 {
795 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
796 if (p->second == mds)
797 in->fragmap.erase(p++);
798 else
799 ++p;
800 }
801
// Insert or refresh an inode in the cache from an mds InodeStat.
// Immutable fields are set only when the inode is first seen; mutable
// fields are applied only if the mds version is strictly newer than
// ours, and fields covered by locally-held EXCL caps are left alone.
// Also installs/updates the cap the mds granted with the stat, and
// marks an empty dir I_COMPLETE|I_DIR_ORDERED when safe to do so.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // before mount completes, ancestors of the mount point are
      // chained up via root_parents
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  if (was_new)
    ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool updating_inode = false;
  int issued = 0;
  if (st->version == 0 ||
      (in->version & ~1) < st->version) {
    updating_inode = true;

    // union of issued, implemented and dirty caps decides which fields
    // we consider locally authoritative below
    int implemented = 0;
    issued = in->caps_issued(&implemented) | in->caps_dirty();
    issued |= implemented;

    in->version = st->version;

    if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
      in->mode = st->mode;
      in->uid = st->uid;
      in->gid = st->gid;
      in->btime = st->btime;
    }

    if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
      in->nlink = st->nlink;
    }

    in->dirstat = st->dirstat;
    in->rstat = st->rstat;
    in->quota = st->quota;
    in->layout = st->layout;

    if (in->is_dir()) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
    }

    update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
			   st->change_attr, st->time_warp_seq, st->ctime,
			   st->mtime, st->atime, st->inline_version,
			   st->inline_data, issued);
  } else if (st->inline_version > in->inline_version) {
    // even on a stale stat, newer inline data is worth taking
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  // move me if/when version reflects fragtree changes.
  if (in->dirfragtree != st->dirfragtree) {
    in->dirfragtree = st->dirfragtree;
    _fragmap_remove_non_leaves(in);
  }

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
		   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
		   request_perms);
    if (in->auth_cap && in->auth_cap->session == session)
      in->max_size = st->max_size;
  } else
    in->snap_caps |= st->cap.caps;

  // setting I_COMPLETE needs to happen after adding the cap
  if (updating_inode &&
      in->is_dir() &&
      (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
      (issued & CEPH_CAP_FILE_EXCL) == 0 &&
      in->dirstat.nfiles == 0 &&
      in->dirstat.nsubdirs == 0) {
    ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
    in->flags |= I_COMPLETE | I_DIR_ORDERED;
    if (in->dir) {
      // the dir is known empty, so every cached dentry must be nulled
      ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		     << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
      in->dir->readdir_cache.clear();
      for (auto p = in->dir->dentries.begin();
	   p != in->dir->dentries.end();
	   ++p) {
	unlink(p->second, true, true);  // keep dir, keep dentry
      }
      if (in->dir->dentries.empty())
	close_dir(in->dir);
    }
  }

  return in;
}
935
936
937 /*
938 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
939 */
// Insert + link a single dentry + inode into the metadata cache,
// reusing an existing dentry for the name when possible and unlinking
// a stale or renamed-from dentry (old_dentry) when supplied.  Updates
// the dentry lease from `dlease` before returning.
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // existing dentry already points at the right inode
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
	       << " with correct vino " << dn->inode->vino()
	       << dendl;
    } else {
      // name now refers to a different inode; null the old link
      ldout(cct, 12) << " had dentry " << dname
	       << " with WRONG vino " << dn->inode->vino()
	       << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a temporary ref so `in` cannot be trimmed while we relink
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// rename across directories perturbs the source dir's ordering
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
985
986 void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
987 {
988 utime_t dttl = from;
989 dttl += (float)dlease->duration_ms / 1000.0;
990
991 assert(dn);
992
993 if (dlease->mask & CEPH_LOCK_DN) {
994 if (dttl > dn->lease_ttl) {
995 ldout(cct, 10) << "got dentry lease on " << dn->name
996 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
997 dn->lease_ttl = dttl;
998 dn->lease_mds = session->mds_num;
999 dn->lease_seq = dlease->seq;
1000 dn->lease_gen = session->cap_gen;
1001 }
1002 }
1003 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1004 }
1005
1006
1007 /*
1008 * update MDS location cache for a single inode
1009 */
/*
 * update MDS location cache for a single inode: record (or forget) the
 * auth mds for one dirfrag, force the fragtree to treat that frag as a
 * leaf, and note whether the frag is replicated.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // negative auth means we have no mapping for this frag
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
              << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
              << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1041
1042 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1043 {
1044 if (diri->flags & I_COMPLETE) {
1045 if (complete) {
1046 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1047 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1048 } else {
1049 if (diri->flags & I_DIR_ORDERED) {
1050 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1051 diri->flags &= ~I_DIR_ORDERED;
1052 }
1053 }
1054 if (diri->dir)
1055 diri->dir->readdir_cache.clear();
1056 }
1057 }
1058
1059 /*
1060 * insert results from readdir or lssnap into the metadata cache.
1061 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  MClientReply *reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // peer feature bits drive how InodeStat is decoded below
  uint64_t features = con->get_features();

  dir_result_t *dirp = request->dirp;
  assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  bufferlist::iterator p = reply->get_extra_bl().begin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      assert(diri);
      // lssnap results are inserted under the virtual .snap directory
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    assert(dir);

    // dirstat
    DirStat dst(p);
    __u32 numdn;
    __u16 flags;
    ::decode(numdn, p);
    ::decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    // frag the request was issued against; the MDS may have replied for a
    // different (split/merged) frag -- see the fg != dst.frag check below
    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real entry ("." and ".." occupy 0 and 1)
    assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      // frag changed under us (dir was fragmented); restart offsets unless
      // hash order makes offsets frag-independent
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      // starting a fresh full listing: snapshot the dir's change counters so
      // we can tell later whether the cached results are still valid
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      ::decode(dname, p);
      ::decode(dlease, p);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	// offsets restart at 2 for each new hash value; entries with the
	// same hash are disambiguated by the increasing counter
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    // we are appending; a complete+ordered dir would already have
	    // every dentry cached
	    assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  assert(0 == "unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // remember where to resume on the next readdir request
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1214
1215 /** insert_trace
1216 *
1217 * insert a trace from a MDS reply into the cache.
1218 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already updated the cache; the safe reply
    // carries no trace
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we cannot update the cache from it, so instead
    // invalidate the cached state the request may have made stale
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	assert(od);
	unlink(od, true, true); // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true); // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  // feature bits control decode format of the stats below
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // trace layout: [dir inode, dir stat, dentry name, dentry lease]
  // when is_dentry, then [target inode] when is_target
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: if we asked for xattrs the MDS must have sent them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target inode: a negative dentry
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  // drop the now-stale linkage (dir is no longer "ordered")
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1384
1385 // -------
1386
/*
 * Pick the MDS rank to send a request to, in priority order:
 *  1. an explicit resend_mds set on the request (forward/retry),
 *  2. the dirfrag->mds fragmap entry for a hashed dentry name,
 *  3. the MDS we hold caps from for the relevant inode,
 *  4. a random up MDS.
 * If the choice came from the fragmap, *phash_diri is set to the dir
 * inode so the caller can prune a stale fragmap entry on failure.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component within the dir inode
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: hash its name within the parent dir
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // snapped inodes have no caps of their own; walk up to a live parent
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dn_set.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      // the hash was computed against the snapped dir; no longer valid
      is_hash = false;
    }

    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
	ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // fall back to whichever MDS issued us caps (prefer the auth cap)
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1486
1487
1488 void Client::connect_mds_targets(mds_rank_t mds)
1489 {
1490 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1491 assert(mds_sessions.count(mds));
1492 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1493 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1494 q != info.export_targets.end();
1495 ++q) {
1496 if (mds_sessions.count(*q) == 0 &&
1497 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1498 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1499 << " export target mds." << *q << dendl;
1500 _open_mds_session(*q);
1501 }
1502 }
1503 }
1504
1505 void Client::dump_mds_sessions(Formatter *f)
1506 {
1507 f->dump_int("id", get_nodeid().v);
1508 f->open_array_section("sessions");
1509 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1510 f->open_object_section("session");
1511 p->second->dump(f);
1512 f->close_section();
1513 }
1514 f->close_section();
1515 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1516 }
1517 void Client::dump_mds_requests(Formatter *f)
1518 {
1519 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1520 p != mds_requests.end();
1521 ++p) {
1522 f->open_object_section("request");
1523 p->second->dump(f);
1524 f->close_section();
1525 }
1526 }
1527
/*
 * Post-process a reply on behalf of a caller that wants the target
 * inode (ptarget) and/or a created flag (pcreated).  If the reply was
 * traceless, recover the target by re-looking it up by name (or a
 * forced getattr), and detect the create-then-vanished race.
 * Returns r, possibly downgraded to -EINTR on that race.
 */
int Client::verify_reply_trace(int r,
			       MetaRequest *request, MClientReply *reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // normal case: insert_trace already resolved the target inode
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  assert(0 == "how did this happen? i want logs!");
	}
      } else {
	// no dentry on the request; refresh the inode we operated on
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  // our create won, but the name now resolves elsewhere: someone
	  // replaced it already -- surface as an interrupted call
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1600
1601
1602 /**
1603 * make a request
1604 *
1605 * Blocking helper to make an MDS request.
1606 *
1607 * If the ptarget flag is set, behavior changes slightly: the caller
1608 * expects to get a pointer to the inode we are creating or operating
1609 * on. As a result, we will follow up any traceless mutation reply
1610 * with a getattr or lookup to transparently handle a traceless reply
1611 * from the MDS (as when the MDS restarts and the client has to replay
1612 * a request).
1613 *
1614 * @param request the MetaRequest to execute
1615 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1616 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1617 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1618 * @param use_mds [optional] prefer a specific mds (-1 for default)
1619 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1620 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // setfilelock requests can block indefinitely, so they are excluded
  // from the oldest-tid tracking reported to the MDS
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, ensure a session, send, and wait; loop again
  // on forward/kick until we have a reply or the request is aborted
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the chosen rank no longer exists (cluster shrank)
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	// Abort requests on REJECT from MDS
	if (rejected_by_mds.count(mds)) {
	  request->abort(-EPERM);
	  break;
	}
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
	   request->resend_mds < 0 && // forward
	   !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted before any reply arrived; clean up our registration/refs
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request); // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
1768
1769 void Client::unregister_request(MetaRequest *req)
1770 {
1771 mds_requests.erase(req->tid);
1772 if (req->tid == oldest_tid) {
1773 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1774 while (true) {
1775 if (p == mds_requests.end()) {
1776 oldest_tid = 0;
1777 break;
1778 }
1779 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1780 oldest_tid = p->first;
1781 break;
1782 }
1783 ++p;
1784 }
1785 }
1786 put_request(req);
1787 }
1788
1789 void Client::put_request(MetaRequest *request)
1790 {
1791 if (request->_put()) {
1792 int op = -1;
1793 if (request->success)
1794 op = request->get_op();
1795 InodeRef other_in;
1796 request->take_other_inode(&other_in);
1797 delete request;
1798
1799 if (other_in &&
1800 (op == CEPH_MDS_OP_RMDIR ||
1801 op == CEPH_MDS_OP_RENAME ||
1802 op == CEPH_MDS_OP_RMSNAP)) {
1803 _try_to_trim_inode(other_in.get(), false);
1804 }
1805 }
1806 }
1807
1808 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1809 mds_rank_t mds, int drop,
1810 int unless, int force)
1811 {
1812 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1813 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1814 << ", have:" << ", force:" << force << ")" << dendl;
1815 int released = 0;
1816 if (in->caps.count(mds)) {
1817 Cap *caps = in->caps[mds];
1818 drop &= ~(in->dirty_caps | get_caps_used(in));
1819 if ((drop & caps->issued) &&
1820 !(unless & caps->issued)) {
1821 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1822 caps->issued &= ~drop;
1823 caps->implemented &= ~drop;
1824 released = 1;
1825 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1826 } else {
1827 released = force;
1828 }
1829 if (released) {
1830 ceph_mds_request_release rel;
1831 rel.ino = in->ino;
1832 rel.cap_id = caps->cap_id;
1833 rel.seq = caps->seq;
1834 rel.issue_seq = caps->issue_seq;
1835 rel.mseq = caps->mseq;
1836 rel.caps = caps->implemented;
1837 rel.wanted = caps->wanted;
1838 rel.dname_len = 0;
1839 rel.dname_seq = 0;
1840 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1841 }
1842 }
1843 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1844 << released << dendl;
1845 return released;
1846 }
1847
1848 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1849 mds_rank_t mds, int drop, int unless)
1850 {
1851 ldout(cct, 20) << "encode_dentry_release enter(dn:"
1852 << dn << ")" << dendl;
1853 int released = 0;
1854 if (dn->dir)
1855 released = encode_inode_release(dn->dir->parent_inode, req,
1856 mds, drop, unless, 1);
1857 if (released && dn->lease_mds == mds) {
1858 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1859 MClientRequest::Release& rel = req->cap_releases.back();
1860 rel.item.dname_len = dn->name.length();
1861 rel.item.dname_seq = dn->lease_seq;
1862 rel.dname = dn->name;
1863 }
1864 ldout(cct, 25) << "encode_dentry_release exit(dn:"
1865 << dn << ")" << dendl;
1866 }
1867
1868
1869 /*
1870 * This requires the MClientRequest *request member to be set.
1871 * It will error out horribly without one.
1872 * Additionally, if you set any *drop member, you'd better have
1873 * set the corresponding dentry!
1874 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  // Encode all requested inode and dentry releases into req->cap_releases.
  // NOTE: the order matters -- each encode_dentry_release annotates the
  // entry most recently appended by its forced inode release.
  ldout(cct, 20) << "encode_cap_releases enter (req: "
		 << req << ", mds: " << mds << ")" << dendl;
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
			 mds, req->inode_drop,
			 req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
			 mds, req->old_inode_drop,
			 req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
			 mds, req->other_inode_drop,
			 req->other_inode_unless);

  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
			  mds, req->dentry_drop,
			  req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
			  mds, req->old_dentry_drop,
			  req->old_dentry_unless);
  ldout(cct, 25) << "encode_cap_releases exit (req: "
		 << req << ", mds " << mds <<dendl;
}
1905
1906 bool Client::have_open_session(mds_rank_t mds)
1907 {
1908 return
1909 mds_sessions.count(mds) &&
1910 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1911 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1912 }
1913
1914 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1915 {
1916 if (mds_sessions.count(mds) == 0)
1917 return NULL;
1918 MetaSession *s = mds_sessions[mds];
1919 if (s->con != con)
1920 return NULL;
1921 return s;
1922 }
1923
1924 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1925 {
1926 if (mds_sessions.count(mds))
1927 return mds_sessions[mds];
1928 return _open_mds_session(mds);
1929 }
1930
1931 /**
1932 * Populate a map of strings with client-identifying metadata,
1933 * such as the hostname. Call this once at initialization.
1934 */
1935 void Client::populate_metadata(const std::string &mount_root)
1936 {
1937 // Hostname
1938 struct utsname u;
1939 int r = uname(&u);
1940 if (r >= 0) {
1941 metadata["hostname"] = u.nodename;
1942 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1943 } else {
1944 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1945 }
1946
1947 metadata["pid"] = stringify(getpid());
1948
1949 // Ceph entity id (the '0' in "client.0")
1950 metadata["entity_id"] = cct->_conf->name.get_id();
1951
1952 // Our mount position
1953 if (!mount_root.empty()) {
1954 metadata["root"] = mount_root;
1955 }
1956
1957 // Ceph version
1958 metadata["ceph_version"] = pretty_version_to_str();
1959 metadata["ceph_sha1"] = git_version_to_str();
1960
1961 // Apply any metadata from the user's configured overrides
1962 std::vector<std::string> tokens;
1963 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1964 for (const auto &i : tokens) {
1965 auto eqpos = i.find("=");
1966 // Throw out anything that isn't of the form "<str>=<str>"
1967 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1968 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1969 continue;
1970 }
1971 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1972 }
1973 }
1974
1975 /**
1976 * Optionally add or override client metadata fields.
1977 */
1978 void Client::update_metadata(std::string const &k, std::string const &v)
1979 {
1980 Mutex::Locker l(client_lock);
1981 assert(initialized);
1982
1983 if (metadata.count(k)) {
1984 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
1985 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
1986 }
1987
1988 metadata[k] = v;
1989 }
1990
/*
 * Create a session object for the given MDS rank (which must not
 * already have one), register it in mds_sessions in STATE_OPENING, and
 * send the session-open request -- unless that same MDS instance
 * previously rejected us, in which case the session is left OPENING
 * with no message sent (callers waiting on it will see the reject).
 */
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
  assert(mds_sessions.count(mds) == 0);
  MetaSession *session = new MetaSession;
  session->mds_num = mds;
  session->seq = 0;
  session->inst = mdsmap->get_inst(mds);
  session->con = messenger->get_connection(session->inst);
  session->state = MetaSession::STATE_OPENING;
  session->mds_state = MDSMap::STATE_NULL;
  // register before any early return so lookups always find the session
  mds_sessions[mds] = session;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->inst) {
      ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
		       "because we were rejected" << dendl;
      return session;
    } else {
      // the rank restarted at a new address; the old rejection no longer applies
      ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
		       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
  m->client_meta = metadata;
  session->con->send_message(m);
  return session;
}
2023
/*
 * Begin a graceful session close: flip to STATE_CLOSING and ask the MDS
 * to close.  Final teardown happens in _closed_mds_session() when the
 * MDS answers (or the session is torn down another way).
 */
void Client::_close_mds_session(MetaSession *s)
{
  ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSING;
  s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
}
2030
/*
 * Tear down a session that is now closed: drop the connection, wake
 * anyone waiting on it, release its caps, requeue its in-flight
 * requests, then unregister and free it.  `s` is invalid on return.
 */
void Client::_closed_mds_session(MetaSession *s)
{
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // wake waiters before freeing so they can observe the CLOSED state
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
  delete s;
}
2042
/*
 * Dispatch an incoming MClientSession message from an MDS.  Messages
 * from connections we have no matching session for are dropped.
 * NOTE: CLOSE and REJECT free the session via _closed_mds_session();
 * `session` must not be touched after those calls.
 */
void Client::handle_client_session(MClientSession *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    // session established: renew caps, then either finish unmount or
    // pre-open sessions to this rank's export targets
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only honor the ack if it matches our latest renew request
    if (session->cap_renew_seq == m->get_seq()) {
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      wake_inode_waiters(session);
    }
    break;

  case CEPH_SESSION_STALE:
    // MDS thinks we timed out; try to renew to recover the session
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    // remember this instance rejected us so we don't immediately retry it
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);

    break;

  default:
    ceph_abort();
  }

  m->put();
}
2106
2107 bool Client::_any_stale_sessions() const
2108 {
2109 assert(client_lock.is_locked_by_me());
2110
2111 for (const auto &i : mds_sessions) {
2112 if (i.second->state == MetaSession::STATE_STALE) {
2113 return true;
2114 }
2115 }
2116
2117 return false;
2118 }
2119
2120 void Client::_kick_stale_sessions()
2121 {
2122 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2123
2124 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2125 p != mds_sessions.end(); ) {
2126 MetaSession *s = p->second;
2127 ++p;
2128 if (s->state == MetaSession::STATE_STALE)
2129 _closed_mds_session(s);
2130 }
2131 }
2132
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // (Re)build the wire message for a metadata request and send it on the
  // given session.  drop_cap_releases is set when resending before cap
  // reconnect has completed, in which case pending cap releases must not
  // piggy-back on the request.  Caller holds client_lock.
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // we already got an unsafe reply; this is a replay, and the target
    // ino (if known) lets the MDS match it to the completed operation
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr can change the file layout; the MDS needs our osdmap epoch
    // to validate the requested pool
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // first transmission of this request: start the latency clock
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap migration seq at send time so an ESTALE reply can be
  // distinguished from a cap migration race (see handle_client_reply)
  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2177
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  // Construct the wire message for a MetaRequest.  Note the side effect:
  // each call bumps request->retry_attempt, so the MDS can detect resends.
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// dentry has no inode yet (e.g. create): path is parent dir + name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // record current attempt number in the message, then bump it
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  // attach the caller's supplementary group list for permission checks
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2215
2216
2217
2218 void Client::handle_client_request_forward(MClientRequestForward *fwd)
2219 {
2220 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2221 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2222 if (!session) {
2223 fwd->put();
2224 return;
2225 }
2226 ceph_tid_t tid = fwd->get_tid();
2227
2228 if (mds_requests.count(tid) == 0) {
2229 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2230 fwd->put();
2231 return;
2232 }
2233
2234 MetaRequest *request = mds_requests[tid];
2235 assert(request);
2236
2237 // reset retry counter
2238 request->retry_attempt = 0;
2239
2240 // request not forwarded, or dest mds has no session.
2241 // resend.
2242 ldout(cct, 10) << "handle_client_request tid " << tid
2243 << " fwd " << fwd->get_num_fwd()
2244 << " to mds." << fwd->get_dest_mds()
2245 << ", resending to " << fwd->get_dest_mds()
2246 << dendl;
2247
2248 request->mds = -1;
2249 request->item.remove_myself();
2250 request->num_fwd = fwd->get_num_fwd();
2251 request->resend_mds = fwd->get_dest_mds();
2252 request->caller_cond->Signal();
2253
2254 fwd->put();
2255 }
2256
2257 bool Client::is_dir_operation(MetaRequest *req)
2258 {
2259 int op = req->get_op();
2260 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2261 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2262 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2263 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2264 return true;
2265 return false;
2266 }
2267
void Client::handle_client_reply(MClientReply *reply)
{
  // Process a reply to one of our metadata requests.  A request may get
  // two replies: first "unsafe" (applied in MDS memory), later "safe"
  // (journaled).  The caller thread is woken exactly once, on the first
  // reply it can observe.  Caller (ms_dispatch) holds client_lock.
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // Only pass ESTALE through if retrying can't help: the chosen target
    // is the same MDS and the caps/mseq haven't changed since we sent
    // (i.e. no cap migration happened underneath us).
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      // wake the caller so it resends to resend_mds
      request->caller_cond->Signal();
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }
  
  assert(request->reply == NULL);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    // track the request on session/dir/target lists until the safe reply
    // arrives, so it can be replayed if the MDS fails first
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;
    
    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();
    
    // wake for kick back
    // wait until the caller clears dispatch_cond and kicks us, so the
    // caller gets to consume the reply while we still hold it alive
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
2370
2371 void Client::_handle_full_flag(int64_t pool)
2372 {
2373 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2374 << "on " << pool << dendl;
2375 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2376 // to do this rather than blocking, because otherwise when we fill up we
2377 // potentially lock caps forever on files with dirty pages, and we need
2378 // to be able to release those caps to the MDS so that it can delete files
2379 // and free up space.
2380 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2381
2382 // For all inodes with layouts in this pool and a pending flush write op
2383 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2384 // from ObjectCacher so that it doesn't re-issue the write in response to
2385 // the ENOSPC error.
2386 // Fortunately since we're cancelling everything in a given pool, we don't
2387 // need to know which ops belong to which ObjectSet, we can just blow all
2388 // the un-flushed cached data away and mark any dirty inodes' async_err
2389 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2390 // affecting this pool, and all the objectsets we're purging were also
2391 // in this pool.
2392 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2393 i != inode_map.end(); ++i)
2394 {
2395 Inode *inode = i->second;
2396 if (inode->oset.dirty_or_tx
2397 && (pool == -1 || inode->layout.pool_id == pool)) {
2398 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2399 << " has dirty objects, purging and setting ENOSPC" << dendl;
2400 objectcacher->purge_set(&inode->oset);
2401 inode->set_async_err(-ENOSPC);
2402 }
2403 }
2404
2405 if (cancelled_epoch != (epoch_t)-1) {
2406 set_cap_epoch_barrier(cancelled_epoch);
2407 }
2408 }
2409
void Client::handle_osd_map(MOSDMap *m)
{
  // React to a new OSD map: detect our own blacklisting, and cancel/purge
  // writes to pools (or the whole cluster) that have gone full.  Caller
  // (ms_dispatch) holds client_lock.
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddr = messenger->get_myaddr();
  if (!blacklisted && new_blacklists.count(myaddr)) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
	});
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;
    // abort all in-flight MDS requests; wake their callers so they can
    // observe the -EBLACKLISTED error
    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
	 p != mds_requests.end(); ) {
      auto req = p->second;
      ++p;
      req->abort(-EBLACKLISTED);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
    }

    // Progress aborts on any requests that were on this waitlist.  Any
    // requests that were on a waiting_for_open session waitlist
    // will get kicked during close session below.
    signal_cond_list(waiting_for_mdsmap);

    // Force-close all sessions: assume this is not abandoning any state
    // on the MDS side because the MDS will have seen the blacklist too.
    while(!mds_sessions.empty()) {
      auto i = mds_sessions.begin();
      auto session = i->second;
      _closed_mds_session(session);
    }

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
	return o.is_blacklisted(myaddr);});
  }

  if (objecter->osdmap_full_flag()) {
    // cluster-wide full flag: cancel/purge writes to every pool
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }

  m->put();
}
2487
2488
2489 // ------------------------
2490 // incoming messages
2491
2492
bool Client::ms_dispatch(Message *m)
{
  // Central message dispatcher.  Returns true if the message was consumed
  // (or deliberately discarded); false lets another dispatcher handle it.
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    // only command replies from an MDS belong to us; others may be for a
    // different dispatcher (e.g. the monitor client)
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // While unmounting, each processed message may have released references;
  // opportunistically trim the cache and poke unmount() if it shrank.
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size() 
		   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size() 
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2570
2571 void Client::handle_fs_map(MFSMap *m)
2572 {
2573 fsmap.reset(new FSMap(m->get_fsmap()));
2574 m->put();
2575
2576 signal_cond_list(waiting_for_fsmap);
2577
2578 monclient->sub_got("fsmap", fsmap->get_epoch());
2579 }
2580
2581 void Client::handle_fs_map_user(MFSMapUser *m)
2582 {
2583 fsmap_user.reset(new FSMapUser);
2584 *fsmap_user = m->get_fsmap();
2585 m->put();
2586
2587 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2588 signal_cond_list(waiting_for_fsmap);
2589 }
2590
void Client::handle_mds_map(MMDSMap* m)
{
  // Apply a new MDSMap: cancel commands to vanished/laggy MDS daemons and
  // drive each session's state machine according to the rank's new state.
  // Caller (ms_dispatch) holds client_lock.
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    // old or duplicate map; ignore
    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    m->put();
    return;
  }  

  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;

  // keep the previous map around so per-rank state transitions can be
  // computed below
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // erase after the scan: erasing while iterating commands would
  // invalidate the iterator
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = p->second;
    ++p;   // advance first: _closed_mds_session() below erases entries

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_inst(mds) != session->inst) {
      // rank restarted at a new address: drop the old connection
      session->con->mark_down();
      session->inst = mdsmap->get_inst(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change
    
    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->get_connection(session->inst);
      send_reconnect(session);
    } else if (newstate >= MDSMap::STATE_ACTIVE) {
      if (oldstate < MDSMap::STATE_ACTIVE) {
	// kick new requests
	kick_requests(session);
	kick_flushing_caps(session);
	signal_context_list(session->waiting_for_open);
	kick_maxsize_requests(session);
	wake_inode_waiters(session);
      }
      connect_mds_targets(mds);
    } else if (newstate == MDSMap::STATE_NULL &&
	       mds >= mdsmap->get_max_mds()) {
      // rank no longer exists (cluster shrank); drop the session
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  m->put();

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2682
void Client::send_reconnect(MetaSession *session)
{
  // Rebuild session state with an MDS that entered reconnect: replay
  // unsafe requests and describe every cap we hold so the recovering MDS
  // can reconstruct its client session.  Caller holds client_lock.
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // any batched cap releases are now meaningless to the recovering MDS
  if (session->release) {
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  // describe every cap we hold from this MDS; snaprealms are added once
  // each (tracked via did_snaprealm)
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
	       << " " << ccap_string(in->caps[mds]->issued)
	       << " wants " << ccap_string(in->caps_wanted())
	       << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      // reset all cap sequence numbers; the recovering MDS starts fresh
      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino, 
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }	
    }
  }

  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
2758
2759
2760 void Client::kick_requests(MetaSession *session)
2761 {
2762 ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2763 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2764 p != mds_requests.end();
2765 ++p) {
2766 MetaRequest *req = p->second;
2767 if (req->got_unsafe)
2768 continue;
2769 if (req->aborted()) {
2770 if (req->caller_cond) {
2771 req->kick = true;
2772 req->caller_cond->Signal();
2773 }
2774 continue;
2775 }
2776 if (req->retry_attempt > 0)
2777 continue; // new requests only
2778 if (req->mds == session->mds_num) {
2779 send_request(p->second, session);
2780 }
2781 }
2782 }
2783
2784 void Client::resend_unsafe_requests(MetaSession *session)
2785 {
2786 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2787 !iter.end();
2788 ++iter)
2789 send_request(*iter, session);
2790
2791 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2792 // process completed requests in clientreplay stage.
2793 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2794 p != mds_requests.end();
2795 ++p) {
2796 MetaRequest *req = p->second;
2797 if (req->got_unsafe)
2798 continue;
2799 if (req->aborted())
2800 continue;
2801 if (req->retry_attempt == 0)
2802 continue; // old requests only
2803 if (req->mds == session->mds_num)
2804 send_request(req, session, true);
2805 }
2806 }
2807
2808 void Client::wait_unsafe_requests()
2809 {
2810 list<MetaRequest*> last_unsafe_reqs;
2811 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2812 p != mds_sessions.end();
2813 ++p) {
2814 MetaSession *s = p->second;
2815 if (!s->unsafe_requests.empty()) {
2816 MetaRequest *req = s->unsafe_requests.back();
2817 req->get();
2818 last_unsafe_reqs.push_back(req);
2819 }
2820 }
2821
2822 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2823 p != last_unsafe_reqs.end();
2824 ++p) {
2825 MetaRequest *req = *p;
2826 if (req->unsafe_item.is_on_list())
2827 wait_on_list(req->waitfor_safe);
2828 put_request(req);
2829 }
2830 }
2831
void Client::kick_requests_closed(MetaSession *session)
{
  // The session to this MDS has been closed: wake the caller of every
  // request targeted at it, and forcibly retire any unsafe requests that
  // can now never receive their safe reply.  Caller holds client_lock.
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;   // advance first: unregister_request() below erases from mds_requests
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	// the safe reply will never arrive; drop the request entirely
	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
2858
2859
2860
2861
2862 /************
2863 * leases
2864 */
2865
2866 void Client::got_mds_push(MetaSession *s)
2867 {
2868 s->seq++;
2869 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2870 if (s->state == MetaSession::STATE_CLOSING) {
2871 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2872 }
2873 }
2874
void Client::handle_lease(MClientLease *m)
{
  // Handle a dentry-lease revoke from the MDS.  Whether or not we still
  // know the inode/dentry, we must acknowledge with a RELEASE so the MDS
  // can make progress.  Caller (ms_dispatch) holds client_lock.
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    // inode already gone from our cache; just acknowledge the revoke
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    // invalidate the lease locally before acknowledging
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;
  }

 revoke:
  // always acknowledge, echoing back the identifying fields
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2917
2918 void Client::put_inode(Inode *in, int n)
2919 {
2920 ldout(cct, 10) << "put_inode on " << *in << dendl;
2921 int left = in->_put(n);
2922 if (left == 0) {
2923 // release any caps
2924 remove_all_caps(in);
2925
2926 ldout(cct, 10) << "put_inode deleting " << *in << dendl;
2927 bool unclean = objectcacher->release_set(&in->oset);
2928 assert(!unclean);
2929 inode_map.erase(in->vino());
2930 if (use_faked_inos())
2931 _release_faked_ino(in);
2932
2933 if (in == root) {
2934 root = 0;
2935 root_ancestor = 0;
2936 while (!root_parents.empty())
2937 root_parents.erase(root_parents.begin());
2938 }
2939
2940 delete in;
2941 }
2942 }
2943
2944 void Client::close_dir(Dir *dir)
2945 {
2946 Inode *in = dir->parent_inode;
2947 ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
2948 assert(dir->is_empty());
2949 assert(in->dir == dir);
2950 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
2951 if (!in->dn_set.empty())
2952 in->get_first_parent()->put(); // unpin dentry
2953
2954 delete in->dir;
2955 in->dir = 0;
2956 put_inode(in); // unpin inode
2957 }
2958
2959 /**
2960 * Don't call this with in==NULL, use get_or_create for that
2961 * leave dn set to default NULL unless you're trying to add
2962 * a new inode to a pre-created Dentry
2963 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  // Attach (or create) the dentry for `name` in `dir`, and optionally link
  // it to `in`.  Returns the dentry.  Caller holds client_lock.
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;
    
    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn);    // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    if (in->is_dir()) {
      // an open Dir or outstanding ll refs on a directory pin its dentry
      if (in->dir)
	dn->get(); // dir -> dn pin
      if (in->ll_ref)
	dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      // drop the directory's previous parent link; the old parent dir's
      // completeness/ordering caches are no longer valid
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dn_set << dendl; 
  }
  
  return dn;
}
3011
3012 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3013 {
3014 InodeRef in;
3015 in.swap(dn->inode);
3016 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3017 << " inode " << dn->inode << dendl;
3018
3019 // unlink from inode
3020 if (in) {
3021 if (in->is_dir()) {
3022 if (in->dir)
3023 dn->put(); // dir -> dn pin
3024 if (in->ll_ref)
3025 dn->put(); // ll_ref -> dn pin
3026 }
3027 dn->inode = 0;
3028 assert(in->dn_set.count(dn));
3029 in->dn_set.erase(dn);
3030 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3031 }
3032
3033 if (keepdentry) {
3034 dn->lease_mds = -1;
3035 } else {
3036 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3037
3038 // unlink from dir
3039 dn->dir->dentries.erase(dn->name);
3040 if (dn->dir->is_empty() && !keepdir)
3041 close_dir(dn->dir);
3042 dn->dir = 0;
3043
3044 // delete den
3045 lru.lru_remove(dn);
3046 dn->put();
3047 }
3048 }
3049
3050 /**
3051 * For asynchronous flushes, check for errors from the IO and
3052 * update the inode if necessary
3053 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  // InodeRef keeps the inode alive until the flush completes
  InodeRef inode;
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  // Completion callback for an async flush: record any I/O error on the
  // inode so a later fsync/close can report it.  Must run with the
  // client_lock held (asserted).
  void finish(int r) override {
    assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3071
3072
3073 /****
3074 * caps
3075 */
3076
void Client::get_cap_ref(Inode *in, int cap)
{
  // Take a reference on the given cap bits.  The first BUFFER or CACHE
  // ref also pins the inode itself, since cached/dirty data must keep it
  // alive; put_cap_ref() drops the matching pin when the last ref goes.
  if ((cap & CEPH_CAP_FILE_BUFFER) &&
      in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
    ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
    in->get();
  }
  if ((cap & CEPH_CAP_FILE_CACHE) &&
      in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
    ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
    in->get();
  }
  in->get_cap_ref(cap);
}
3091
void Client::put_cap_ref(Inode *in, int cap)
{
  // Drop a reference on the given cap bits.  When the last ref on a bit
  // goes away this may finish a pending cap_snap, wake writers, drop the
  // inode pins taken by get_cap_ref(), and check whether caps should be
  // returned to the MDS.  Caller holds client_lock.
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we no longer hold refs on and that the MDS hasn't issued:
    // candidates for releasing back to the MDS via check_caps()
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	// last writer is gone; the newest cap_snap can now be finalized
	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	// buffered data has been flushed; clear per-snap dirty flags and
	// wake threads waiting on commit
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3125
/*
 * Block until we hold the caps in 'need' (plus whatever subset of 'want'
 * is currently available without a pending revocation).  On success,
 * returns 0 with *phave set to the granted bits and a cap reference
 * taken on 'need'.  May return a negative error: check_pool_perm()
 * failures, -EBADF if the open file modes no longer want the needed
 * caps, -EROFS for write caps on a readonly session, or _renew_caps()
 * failures.  May drop and retake client_lock while waiting.
 */
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    // sanity: some open file handle must still want these caps
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // writing past max_size: ask the auth MDS for a bigger max_size
      // before we block, so the wait can actually end
      if (endoff > 0 &&
          (endoff >= (loff_t)in->max_size ||
           endoff > (loff_t)(in->size << 1)) &&
          endoff > (loff_t)in->wanted_max_size) {
        ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
        in->wanted_max_size = endoff;
        check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        // cannot write while a snapshot of this inode is being captured
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          // kick off writeback so the dirty snapshot data drains
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        // only grant 'want' bits that are not mid-revocation
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                 << " need " << ccap_string(need) << " want " << ccap_string(want)
                 << " revoking " << ccap_string(revoking)
                 << dendl;
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // our caps were dropped (e.g. session reset); re-request them from
      // the MDS if it no longer knows we want them
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if ((mds_wanted & file_wanted) ==
          (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
        in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3220
3221 int Client::get_caps_used(Inode *in)
3222 {
3223 unsigned used = in->caps_used();
3224 if (!(used & CEPH_CAP_FILE_CACHE) &&
3225 !objectcacher->set_is_empty(&in->oset))
3226 used |= CEPH_CAP_FILE_CACHE;
3227 return used;
3228 }
3229
3230 void Client::cap_delay_requeue(Inode *in)
3231 {
3232 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3233 in->hold_caps_until = ceph_clock_now();
3234 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3235 delayed_caps.push_back(&in->cap_item);
3236 }
3237
/*
 * Serialize the current cap/inode state into an MClientCaps UPDATE
 * message and send it to the MDS session.  Updates cap->issued /
 * cap->implemented to reflect what we agreed to retain, records the
 * size we reported, and (for the auth cap) the max_size we requested.
 * 'flush' is the set of dirty cap bits being flushed under 'flush_tid'.
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      bool sync, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain bits the MDS is revoking
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
           << " mds." << session->mds_num << " seq " << cap->seq
           << (sync ? " sync " : " async ")
           << " used " << ccap_string(used)
           << " want " << ccap_string(want)
           << " flush " << ccap_string(flush)
           << " retain " << ccap_string(retain)
           << " held "<< ccap_string(held)
           << " revoking " << ccap_string(revoking)
           << " dropping " << ccap_string(dropping)
           << dendl;

  // test hook: deliberately fail to release caps (config option)
  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  // a flush must name the snap context it applies after
  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
                                   in->ino,
                                   0,
                                   cap->cap_id, cap->seq,
                                   cap->implemented,
                                   want,
                                   flush,
                                   cap->mseq,
                                   cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // only ship xattrs when the Xx dirty bit is being flushed
  if (flush & CEPH_CAP_XATTR_EXCL) {
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  // remember what size the MDS has seen (used by is_max_size_approaching)
  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3347
3348 static bool is_max_size_approaching(Inode *in)
3349 {
3350 /* mds will adjust max size according to the reported size */
3351 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3352 return false;
3353 if (in->size >= in->max_size)
3354 return true;
3355 /* half of previous max_size increment has been used */
3356 if (in->max_size > in->reported_size &&
3357 (in->size << 1) >= in->max_size + in->reported_size)
3358 return true;
3359 return false;
3360 }
3361
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * For each cap we hold, decide whether to send an update message
 * (growing max_size, acknowledging a completed revocation, requesting
 * more caps, releasing unneeded caps on unmount) or to skip/delay it.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_NODELAY,
 *        CHECK_CAPS_SYNCHRONOUS)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  if (in->is_dir() && (in->flags & I_COMPLETE)) {
    // we do this here because we don't want to drop to Fs (and then
    // drop the Fs if we do a create!) if that alone makes us send lookups
    // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
    wanted |= CEPH_CAP_FILE_EXCL;
  }

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  // what we are willing to keep holding
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting) {
    if (wanted)
      retain |= CEPH_CAP_ANY;
    else
      retain |= CEPH_CAP_ANY_SHARED;
  }

  ldout(cct, 10) << "check_caps on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // Fc being revoked and no dirty buffers: drop the clean page cache now
  if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
      (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER))
    _release(in);

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  if (flags & CHECK_CAPS_NODELAY)
    in->hold_caps_until = utime_t();
  else
    cap_delay_requeue(in);

  utime_t now = ceph_clock_now();

  map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
  while (it != in->caps.end()) {
    mds_rank_t mds = it->first;
    Cap *cap = it->second;
    ++it;  // advance before anything below can invalidate the entry

    MetaSession *session = mds_sessions[mds];
    assert(session);

    // usage against a non-auth cap ignores bits the auth cap covers
    cap_used = used;
    if (in->auth_cap && cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap->implemented & ~cap->issued;

    ldout(cct, 10) << " cap mds." << mds
             << " issued " << ccap_string(cap->issued)
             << " implemented " << ccap_string(cap->implemented)
             << " revoking " << ccap_string(revoking) << dendl;

    // need a bigger max_size and haven't asked for it yet?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap->issued & CEPH_CAP_FILE_WR) &&
        cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap->wanted | cap->issued))
      goto ack;

    // on unmount, release idle caps promptly
    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if (wanted == cap->wanted &&         // mds knows what we want.
        ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
        !in->dirty_caps)                 // and we have no dirty caps
      continue;

    if (now < in->hold_caps_until) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
        session->mds_state < MDSMap::STATE_ACTIVE &&
        session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
        flush_snaps(in, true);
      if (in->flushing_caps)
        flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
             retain, flushing, flush_tid);
  }
}
3508
3509
/*
 * Capture the inode's dirty state for the snapshot identified by
 * old_snapc so it can be flushed to the MDS as a FLUSHSNAP.  Does
 * nothing if the newest cap_snap is still being written, or if the
 * inode has no dirty state worth snapshotting.
 */
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // previous snapshot capture has not finished yet
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    // construct the CapSnap in place, keyed by the old snap context's seq
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    // dirty buffered data must drain before the snap can be flushed
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // writes in flight: finish_cap_snap happens in put_cap_ref when
      // the last FILE_WR reference is dropped
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3549
/*
 * Finalize a queued cap_snap: record the inode's current size, times
 * and dirty bits into the CapSnap and, unless buffered data is still
 * outstanding (FILE_BUFFER in 'used'), flush it to the MDS right away.
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // buffered writes still dirty; _flushed_cap_snap / put_cap_ref will
    // flush the snap once writeback completes
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
             << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3575
3576 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3577 {
3578 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3579 in->cap_snaps.at(seq).dirty_data = 0;
3580 flush_snaps(in);
3581 }
3582
3583 void Client::flush_snaps(Inode *in, bool all_again)
3584 {
3585 ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
3586 assert(in->cap_snaps.size());
3587
3588 // pick auth mds
3589 assert(in->auth_cap);
3590 MetaSession *session = in->auth_cap->session;
3591 int mseq = in->auth_cap->mseq;
3592
3593 for (auto &p : in->cap_snaps) {
3594 CapSnap &capsnap = p.second;
3595 if (!all_again) {
3596 // only flush once per session
3597 if (capsnap.flush_tid > 0)
3598 continue;
3599 }
3600
3601 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3602 << " follows " << p.first
3603 << " size " << capsnap.size
3604 << " mtime " << capsnap.mtime
3605 << " dirty_data=" << capsnap.dirty_data
3606 << " writing=" << capsnap.writing
3607 << " on " << *in << dendl;
3608 if (capsnap.dirty_data || capsnap.writing)
3609 continue;
3610
3611 if (capsnap.flush_tid == 0) {
3612 capsnap.flush_tid = ++last_flush_tid;
3613 if (!in->flushing_cap_item.is_on_list())
3614 session->flushing_caps.push_back(&in->flushing_cap_item);
3615 session->flushing_caps_tids.insert(capsnap.flush_tid);
3616 }
3617
3618 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
3619 cap_epoch_barrier);
3620 if (user_id >= 0)
3621 m->caller_uid = user_id;
3622 if (group_id >= 0)
3623 m->caller_gid = group_id;
3624
3625 m->set_client_tid(capsnap.flush_tid);
3626 m->head.snap_follows = p.first;
3627
3628 m->head.caps = capsnap.issued;
3629 m->head.dirty = capsnap.dirty;
3630
3631 m->head.uid = capsnap.uid;
3632 m->head.gid = capsnap.gid;
3633 m->head.mode = capsnap.mode;
3634 m->btime = capsnap.btime;
3635
3636 m->size = capsnap.size;
3637
3638 m->head.xattr_version = capsnap.xattr_version;
3639 ::encode(capsnap.xattrs, m->xattrbl);
3640
3641 m->ctime = capsnap.ctime;
3642 m->btime = capsnap.btime;
3643 m->mtime = capsnap.mtime;
3644 m->atime = capsnap.atime;
3645 m->time_warp_seq = capsnap.time_warp_seq;
3646 m->change_attr = capsnap.change_attr;
3647
3648 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3649 m->inline_version = in->inline_version;
3650 m->inline_data = in->inline_data;
3651 }
3652
3653 assert(!session->flushing_caps_tids.empty());
3654 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3655
3656 session->con->send_message(m);
3657 }
3658 }
3659
3660
3661
3662 void Client::wait_on_list(list<Cond*>& ls)
3663 {
3664 Cond cond;
3665 ls.push_back(&cond);
3666 cond.Wait(client_lock);
3667 ls.remove(&cond);
3668 }
3669
3670 void Client::signal_cond_list(list<Cond*>& ls)
3671 {
3672 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3673 (*it)->Signal();
3674 }
3675
3676 void Client::wait_on_context_list(list<Context*>& ls)
3677 {
3678 Cond cond;
3679 bool done = false;
3680 int r;
3681 ls.push_back(new C_Cond(&cond, &done, &r));
3682 while (!done)
3683 cond.Wait(client_lock);
3684 }
3685
3686 void Client::signal_context_list(list<Context*>& ls)
3687 {
3688 while (!ls.empty()) {
3689 ls.front()->complete(0);
3690 ls.pop_front();
3691 }
3692 }
3693
3694 void Client::wake_inode_waiters(MetaSession *s)
3695 {
3696 xlist<Cap*>::iterator iter = s->caps.begin();
3697 while (!iter.end()){
3698 signal_cond_list((*iter)->inode->waitfor_caps);
3699 ++iter;
3700 }
3701 }
3702
3703
3704 // flush dirty data (from objectcache)
3705
3706 class C_Client_CacheInvalidate : public Context {
3707 private:
3708 Client *client;
3709 vinodeno_t ino;
3710 int64_t offset, length;
3711 public:
3712 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3713 client(c), offset(off), length(len) {
3714 if (client->use_faked_inos())
3715 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3716 else
3717 ino = in->vino();
3718 }
3719 void finish(int r) override {
3720 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3721 assert(!client->client_lock.is_locked_by_me());
3722 client->_async_invalidate(ino, offset, length);
3723 }
3724 };
3725
// Invoke the registered cache-invalidation callback for the given inode
// and byte range; skipped entirely once unmount has begun.
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
3733
3734 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3735
3736 if (ino_invalidate_cb)
3737 // we queue the invalidate, which calls the callback and decrements the ref
3738 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3739 }
3740
3741 void Client::_invalidate_inode_cache(Inode *in)
3742 {
3743 ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
3744
3745 // invalidate our userspace inode cache
3746 if (cct->_conf->client_oc)
3747 objectcacher->release_set(&in->oset);
3748
3749 _schedule_invalidate_callback(in, 0, 0);
3750 }
3751
3752 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3753 {
3754 ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
3755
3756 // invalidate our userspace inode cache
3757 if (cct->_conf->client_oc) {
3758 vector<ObjectExtent> ls;
3759 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3760 objectcacher->discard_set(&in->oset, ls);
3761 }
3762
3763 _schedule_invalidate_callback(in, off, len);
3764 }
3765
3766 bool Client::_release(Inode *in)
3767 {
3768 ldout(cct, 20) << "_release " << *in << dendl;
3769 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3770 _invalidate_inode_cache(in);
3771 return true;
3772 }
3773 return false;
3774 }
3775
3776 bool Client::_flush(Inode *in, Context *onfinish)
3777 {
3778 ldout(cct, 10) << "_flush " << *in << dendl;
3779
3780 if (!in->oset.dirty_or_tx) {
3781 ldout(cct, 10) << " nothing to flush" << dendl;
3782 onfinish->complete(0);
3783 return true;
3784 }
3785
3786 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3787 ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3788 objectcacher->purge_set(&in->oset);
3789 if (onfinish) {
3790 onfinish->complete(-ENOSPC);
3791 }
3792 return true;
3793 }
3794
3795 return objectcacher->flush_set(&in->oset, onfinish);
3796 }
3797
/*
 * Synchronously flush dirty buffered data in [offset, offset+size).
 * Must be called with client_lock held; the lock is dropped while we
 * block waiting for writeback to finish and re-acquired before return.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // private mutex/cond pair for the completion handshake
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, onflush);
  if (!ret) {
    // wait for flush; drop client_lock so the completion can make
    // progress, then retake it before returning to the caller
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3822
3823 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3824 {
3825 // Mutex::Locker l(client_lock);
3826 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3827 Inode *in = static_cast<Inode *>(oset->parent);
3828 assert(in);
3829 _flushed(in);
3830 }
3831
// Writeback for this inode finished: release the CACHE and BUFFER cap
// references that were pinning the flush.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3838
3839
3840
3841 // checks common to add_update_cap, handle_cap_grant
3842 void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3843 {
3844 unsigned had = in->caps_issued();
3845
3846 if ((issued & CEPH_CAP_FILE_CACHE) &&
3847 !(had & CEPH_CAP_FILE_CACHE))
3848 in->cache_gen++;
3849
3850 if ((issued & CEPH_CAP_FILE_SHARED) &&
3851 !(had & CEPH_CAP_FILE_SHARED)) {
3852 in->shared_gen++;
3853
3854 if (in->is_dir())
3855 clear_dir_complete_and_ordered(in, true);
3856 }
3857 }
3858
/*
 * Add a new cap for this inode from the given MDS session, or update
 * the one we already hold.  Handles the auth-MDS-change race (export
 * seen, import not yet seen), opens the snaprealm on the first cap,
 * and may re-run check_caps if a non-auth MDS is revoking the newly
 * granted bits.
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
                            int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // first cap from this MDS
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // very first cap on the inode: attach it to its snap realm
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
    cap_list.push_back(&in->cap_item);
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // switch auth cap only if this cap's migration seq is newer
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << "add_update_cap changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued |= issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
           << " from mds." << mds
           << " on " << *in
           << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
        continue;
      if (it->second->implemented & ~it->second->issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // new bits arrived: waiters in get_caps() may be able to proceed
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
3944
/*
 * Tear down a single cap: optionally queue a release message to the
 * MDS, detach auth-cap / flushing-list state, erase it from the inode,
 * and close the snaprealm if this was the inode's last cap.
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    // batch the release into the session's pending cap-release message
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;

  if (!in->is_any_caps()) {
    // last cap gone: drop the snaprealm reference taken in add_update_cap
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
3983
3984 void Client::remove_all_caps(Inode *in)
3985 {
3986 while (!in->caps.empty())
3987 remove_cap(in->caps.begin()->second, true);
3988 }
3989
/*
 * Drop every cap held through this session (session reset/close path).
 * For auth caps we discard any pending cap_snaps and dirty/flushing
 * state (data loss is logged), mark the inode I_CAP_DROPPED so callers
 * of get_caps() will re-request caps, and wake anyone waiting.
 */
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      // record what state we are about to throw away
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;
    }
    remove_cap(cap, false);
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // hold a ref while clearing so the inode can't vanish mid-clear
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->dirty_caps = 0;
      // drop the inode ref taken by mark_caps_dirty
      put_inode(in);
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4025
4026 class C_Client_Remount : public Context {
4027 private:
4028 Client *client;
4029 public:
4030 explicit C_Client_Remount(Client *c) : client(c) {}
4031 void finish(int r) override {
4032 assert (r == 0);
4033 r = client->remount_cb(client->callback_handle);
4034 if (r != 0) {
4035 client_t whoami = client->get_nodeid();
4036 lderr(client->cct) << "tried to remount (to trim kernel dentries) and got error "
4037 << r << dendl;
4038 if (client->require_remount && !client->unmounting) {
4039 assert(0 == "failed to remount for kernel dentry trimming");
4040 }
4041 }
4042 }
4043 };
4044
4045 void Client::_invalidate_kernel_dcache()
4046 {
4047 if (unmounting)
4048 return;
4049 if (can_invalidate_dentries && dentry_invalidate_cb && root->dir) {
4050 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4051 p != root->dir->dentries.end();
4052 ++p) {
4053 if (p->second->inode)
4054 _schedule_invalidate_dentry_callback(p->second, false);
4055 }
4056 } else if (remount_cb) {
4057 // Hacky:
4058 // when remounting a file system, linux kernel trims all unused dentries in the fs
4059 remount_finisher.queue(new C_Client_Remount(this));
4060 }
4061 }
4062
/*
 * Try to reduce the number of caps held through this session down to
 * 'max' (typically requested by the MDS).  Unused non-auth caps are
 * released outright; for other caps we try to expire the inode's
 * dentries so the cap can be dropped by normal cache trimming.
 */
void Client::trim_caps(MetaSession *s, int max)
{
  mds_rank_t mds = s->mds_num;
  int caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  int trimmed = 0;
  xlist<Cap*>::iterator p = s->caps.begin();
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    Inode *in = cap->inode;

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap: everything it provides is either
      // unused or already covered by the auth cap
      if (!(get_caps_used(in) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        remove_cap(cap, true);
        trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      InodeRef tmp_ref(in);  // keep the inode alive while trimming dentries
      while (q != in->dn_set.end()) {
        Dentry *dn = *q++;
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          trim_dentry(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      // every dentry expired: the cap should follow once refs drain
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }

  // still over budget: ask the kernel to drop dentries too
  if (s->caps.size() > max)
    _invalidate_kernel_dcache();
}
4120
4121 void Client::force_session_readonly(MetaSession *s)
4122 {
4123 s->readonly = true;
4124 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4125 Inode *in = (*p)->inode;
4126 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4127 signal_cond_list(in->waitfor_caps);
4128 }
4129 }
4130
// Mark cap bits dirty on this inode.  The first dirty bit takes an
// inode reference that is dropped once the dirty caps are flushed
// (see remove_session_caps / flush completion paths).
void Client::mark_caps_dirty(Inode *in, int caps)
{
  ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> "
           << ccap_string(in->dirty_caps | caps) << dendl;
  if (caps && !in->caps_dirty())
    in->get();
  in->dirty_caps |= caps;
}
4139
/*
 * Transition this inode's dirty caps to "flushing": allocate a flush
 * tid, record it on the inode and the auth session, and clear the
 * dirty bits.  Returns the cap bits being flushed and stores the new
 * tid in *ptid.  Requires dirty caps and an auth cap.
 */
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // dirty -> flushing; the inode ref from mark_caps_dirty carries over
  in->flushing_caps |= flushing;
  in->dirty_caps = 0;

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4167
4168 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4169 {
4170 for (auto &p : in->cap_snaps) {
4171 CapSnap &capsnap = p.second;
4172 if (capsnap.flush_tid > 0) {
4173 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4174 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4175 }
4176 }
4177 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4178 it != in->flushing_cap_tids.end();
4179 ++it) {
4180 old_s->flushing_caps_tids.erase(it->first);
4181 new_s->flushing_caps_tids.insert(it->first);
4182 }
4183 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4184 }
4185
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // First pass: inodes whose cap check was deferred (delayed_caps). Advance
  // the iterator before popping, since check_caps() may requeue entries.
  xlist<Inode*>::iterator p = delayed_caps.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    delayed_caps.pop_front();
    // only the very last check (nothing left delayed AND nothing on
    // cap_list) is made synchronous
    if (p.end() && cap_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = cap_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
4218
// Re-send cap flush messages for every flush tid still outstanding on this
// inode. When 'sync' is set, the last (highest) tid asks the MDS to flush
// its journal so the ack implies durability.
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  assert(cap->session == session);

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
             (get_caps_used(in) | in->caps_dirty()),
             in->caps_wanted(), (cap->issued | cap->implemented),
             p->second, p->first);
  }
}
4240
4241 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4242 {
4243 while (in->flushing_caps) {
4244 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4245 assert(it != in->flushing_cap_tids.end());
4246 if (it->first > want)
4247 break;
4248 ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
4249 << ccap_string(it->second) << " want " << want
4250 << " last " << it->first << dendl;
4251 wait_on_list(in->waitfor_caps);
4252 }
4253 }
4254
4255 void Client::wait_sync_caps(ceph_tid_t want)
4256 {
4257 retry:
4258 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4259 << num_flushing_caps << " total flushing)" << dendl;
4260 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4261 p != mds_sessions.end();
4262 ++p) {
4263 MetaSession *s = p->second;
4264 if (s->flushing_caps_tids.empty())
4265 continue;
4266 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4267 if (oldest_tid <= want) {
4268 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4269 << " (want " << want << ")" << dendl;
4270 sync_cond.Wait(client_lock);
4271 goto retry;
4272 }
4273 }
4274 }
4275
4276 void Client::kick_flushing_caps(MetaSession *session)
4277 {
4278 mds_rank_t mds = session->mds_num;
4279 ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;
4280
4281 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4282 Inode *in = *p;
4283 if (session->early_flushing_caps.count(in))
4284 continue;
4285 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4286 if (in->cap_snaps.size())
4287 flush_snaps(in, true);
4288 if (in->flushing_caps)
4289 flush_caps(in, session);
4290 }
4291
4292 session->early_flushing_caps.clear();
4293 }
4294
// During the reconnect stage, re-send cap flushes whose flushing bits were
// revoked by the MDS restart, and remember those inodes so the regular
// kick_flushing_caps() pass skips them.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    assert(in->auth_cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;  // nothing revoked; the post-reconnect kick handles this inode

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
                   << " to mds." << session->mds_num << dendl;

    // record so kick_flushing_caps() doesn't flush this inode twice
    session->early_flushing_caps.insert(in);

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4321
4322 void Client::kick_maxsize_requests(MetaSession *session)
4323 {
4324 xlist<Cap*>::iterator iter = session->caps.begin();
4325 while (!iter.end()){
4326 (*iter)->inode->requested_max_size = 0;
4327 (*iter)->inode->wanted_max_size = 0;
4328 signal_cond_list((*iter)->inode->waitfor_caps);
4329 ++iter;
4330 }
4331 }
4332
4333 void SnapRealm::build_snap_context()
4334 {
4335 set<snapid_t> snaps;
4336 snapid_t max_seq = seq;
4337
4338 // start with prior_parents?
4339 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4340 snaps.insert(prior_parent_snaps[i]);
4341
4342 // current parent's snaps
4343 if (pparent) {
4344 const SnapContext& psnapc = pparent->get_snap_context();
4345 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4346 if (psnapc.snaps[i] >= parent_since)
4347 snaps.insert(psnapc.snaps[i]);
4348 if (psnapc.seq > max_seq)
4349 max_seq = psnapc.seq;
4350 }
4351
4352 // my snaps
4353 for (unsigned i=0; i<my_snaps.size(); i++)
4354 snaps.insert(my_snaps[i]);
4355
4356 // ok!
4357 cached_snap_context.seq = max_seq;
4358 cached_snap_context.snaps.resize(0);
4359 cached_snap_context.snaps.reserve(snaps.size());
4360 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4361 cached_snap_context.snaps.push_back(*p);
4362 }
4363
4364 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4365 {
4366 list<SnapRealm*> q;
4367 q.push_back(realm);
4368
4369 while (!q.empty()) {
4370 realm = q.front();
4371 q.pop_front();
4372
4373 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4374 realm->invalidate_cache();
4375
4376 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4377 p != realm->pchildren.end();
4378 ++p)
4379 q.push_back(*p);
4380 }
4381 }
4382
4383 SnapRealm *Client::get_snap_realm(inodeno_t r)
4384 {
4385 SnapRealm *realm = snap_realms[r];
4386 if (!realm)
4387 snap_realms[r] = realm = new SnapRealm(r);
4388 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4389 realm->nref++;
4390 return realm;
4391 }
4392
4393 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4394 {
4395 if (snap_realms.count(r) == 0) {
4396 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4397 return NULL;
4398 }
4399 SnapRealm *realm = snap_realms[r];
4400 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4401 realm->nref++;
4402 return realm;
4403 }
4404
4405 void Client::put_snap_realm(SnapRealm *realm)
4406 {
4407 ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
4408 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4409 if (--realm->nref == 0) {
4410 snap_realms.erase(realm->ino);
4411 if (realm->pparent) {
4412 realm->pparent->pchildren.erase(realm);
4413 put_snap_realm(realm->pparent);
4414 }
4415 delete realm;
4416 }
4417 }
4418
4419 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4420 {
4421 if (realm->parent != parent) {
4422 ldout(cct, 10) << "adjust_realm_parent " << *realm
4423 << " " << realm->parent << " -> " << parent << dendl;
4424 realm->parent = parent;
4425 if (realm->pparent) {
4426 realm->pparent->pchildren.erase(realm);
4427 put_snap_realm(realm->pparent);
4428 }
4429 realm->pparent = get_snap_realm(parent);
4430 realm->pparent->pchildren.insert(realm);
4431 return true;
4432 }
4433 return false;
4434 }
4435
4436 static bool has_new_snaps(const SnapContext& old_snapc,
4437 const SnapContext& new_snapc)
4438 {
4439 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4440 }
4441
4442
// Apply a snap trace (a sequence of encoded SnapRealmInfo records) sent by
// the MDS. Updates/creates the affected realms and, when 'flush' is set,
// queues cap snaps for inodes in realms that gained new snapshots. On
// return, *realm_ret (if non-NULL) receives a referenced pointer to the
// first realm in the trace; otherwise that reference is dropped here.
void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;

  // realms whose snap context may change; each entry holds an extra nref
  // plus the pre-update snap context so we can detect new snaps afterwards
  map<SnapRealm*, SnapContext> dirty_realms;

  bufferlist::iterator p = bl.begin();
  while (!p.end()) {
    SnapRealmInfo info;
    ::decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());  // takes a ref

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
		     << dendl;

      if (flush) {
	// writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
	// flush me + children
	list<SnapRealm*> q;
	q.push_back(realm);
	while (!q.empty()) {
	  SnapRealm *realm = q.front();
	  q.pop_front();

	  for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
	       p != realm->pchildren.end();
	       ++p)
	    q.push_back(*p);

	  if (dirty_realms.count(realm) == 0) {
	    realm->nref++;  // ref held by dirty_realms; dropped in loop below
	    dirty_realms[realm] = realm->get_snap_context();
	  }
	}
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
		     << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }

    if (!first_realm)
      first_realm = realm;  // keep this ref for the caller (or put below)
    else
      put_snap_realm(realm);
  }

  // queue cap snaps for any dirty realm that actually gained snapshots
  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
       q != dirty_realms.end();
       ++q) {
    SnapRealm *realm = q->first;
    // if there are new snaps ?
    if (has_new_snaps(q->second, realm->get_snap_context())) {
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
      while (!r.end()) {
	Inode *in = *r;
	++r;
	queue_cap_snap(in, q->second);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);  // drop the ref taken when added to dirty_realms
  }

  // NOTE(review): assumes the trace is non-empty; with an empty bl and a
  // NULL realm_ret this would call put_snap_realm(NULL) -- confirm callers
  if (realm_ret)
    *realm_ret = first_realm;
  else
    put_snap_realm(first_realm);
}
4535
// Handle an MClientSnap notification. For a SPLIT op, the inodes listed in
// split_inos (and the child realms in split_realms) are moved out of their
// current realm into the newly split-off one; after the snap trace is
// applied, any moved inode whose new context has new snaps gets a cap snap
// queued for writeback.
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();  // no session for this connection: drop the message
    return;
  }

  got_mds_push(session);

  // inodes to re-home into 'realm', keyed with their pre-move snap context
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
	 p != m->split_inos.end();
	 ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// never move an inode backwards into an older realm
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
	 p != m->split_realms.end();
	 ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the snap trace; don't flush on DESTROY (the snaps are going away)
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // attach the moved inodes to the split-off realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
4613
4614 void Client::handle_quota(MClientQuota *m)
4615 {
4616 mds_rank_t mds = mds_rank_t(m->get_source().num());
4617 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4618 if (!session) {
4619 m->put();
4620 return;
4621 }
4622
4623 got_mds_push(session);
4624
4625 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4626
4627 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4628 if (inode_map.count(vino)) {
4629 Inode *in = NULL;
4630 in = inode_map[vino];
4631
4632 if (in) {
4633 in->quota = m->quota;
4634 in->rstat = m->rstat;
4635 }
4636 }
4637
4638 m->put();
4639 }
4640
// Top-level dispatcher for MClientCaps messages: resolve the session and
// inode, honor any OSD epoch barrier carried by the message, and route to
// the per-op handlers.
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();  // no session: drop
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload(); // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      // can't use an imported cap for an inode we don't know about; tell
      // the MDS to release it immediately
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    // note: deliberately no return here -- after recording the import we
    // fall through to the grant handling in the second switch below
    handle_cap_import(session, in, m);
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();  // unhandled op: just drop the message
  }
}
4714
// An MDS has imported a cap for this inode (e.g. after subtree migration).
// Record/refresh the cap on the importing session, apply the accompanying
// snap trace, retire the old peer cap if it matches the message, and
// re-flush outstanding state if this session is now auth.
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  // look up the cap on the exporting MDS (if any) so we can carry over the
  // perms it was issued under, and remove it below
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap) {
      cap_perms = cap->latest_perms;
    }
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
		 CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the old peer cap only if it still matches the import message
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);  // drop the ref returned by update_snap_trace

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4755
// An MDS is exporting our cap for this inode to a peer MDS. If peer cap
// info is included, fold our issued bits into the (possibly new) peer cap
// and migrate auth/flush bookkeeping; otherwise just note that the cap was
// dropped. The local cap is removed in either case.
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  // only act if the message matches the cap we actually hold
  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
	Cap *tcap = in->caps[peer_mds];
	// update the existing peer cap only if the message carries newer info
	if (tcap->cap_id != m->peer.cap_id ||
	    ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
	  tcap->cap_id = m->peer.cap_id;
	  tcap->seq = m->peer.seq - 1;
	  tcap->issue_seq = tcap->seq;
	  tcap->mseq = m->peer.mseq;
	  tcap->issued |= cap->issued;
	  tcap->implemented |= cap->issued;
	  if (cap == in->auth_cap)
	    in->auth_cap = tcap;
	  // migrate pending flush tids to the session that is now auth
	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
	    adjust_session_flushing_caps(in, session, tsession);
	}
      } else {
	// peer has no cap yet: create one mirroring the exported cap
	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
		       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
		       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
		       cap->latest_perms);
      }
    } else {
      // export without a peer: our cap is simply being dropped
      if (cap == in->auth_cap)
	in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
4803
4804 void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
4805 {
4806 mds_rank_t mds = session->mds_num;
4807 assert(in->caps[mds]);
4808
4809 ldout(cct, 10) << "handle_cap_trunc on ino " << *in
4810 << " size " << in->size << " -> " << m->get_size()
4811 << dendl;
4812
4813 int implemented = 0;
4814 int issued = in->caps_issued(&implemented) | in->caps_dirty();
4815 issued |= implemented;
4816 update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(),
4817 m->get_size(), m->get_change_attr(), m->get_time_warp_seq(),
4818 m->get_ctime(), m->get_mtime(), m->get_atime(),
4819 m->inline_version, m->inline_data, issued);
4820 m->put();
4821 }
4822
// Ack for a cap flush we sent. Retires every flush tid up to and including
// the acked one, clears the confirmed-clean bits from flushing_caps, and
// wakes both per-inode waiters and wait_sync_caps() sleepers.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;  // cap bits confirmed clean by this ack
  int flushed = 0;  // number of flush tids retired

  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // older (and the matching) tids are implicitly acked too
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // bits still covered by a later flush are not clean yet
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
		<< " cleaned " << ccap_string(cleaned) << " on " << *in
		<< " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake wait_sync_caps() if this session's oldest outstanding tid advanced
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    // nothing remains dirty; forget who dirtied the caps
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // drop the pin taken by mark_caps_dirty() once fully clean
      if (!in->caps_dirty())
	put_inode(in);
    }
  }

  m->put();
}
4882
4883
// Ack for a FLUSHSNAP: the MDS has persisted the cap snap identified by
// 'follows', so retire it. Duplicate acks (no matching cap snap, or a
// mismatched flush tid) are logged and ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  // NOTE(review): in->caps[mds] inside assert() has a side effect
  // (operator[] default-inserts a NULL entry on a miss); harmless here
  // since a miss aborts anyway, but worth cleaning up
  assert(in->caps[mds]);
  snapid_t follows = m->get_snap_follows();

  if (in->cap_snaps.count(follows)) {
    CapSnap &capsnap = in->cap_snaps.at(follows);
    if (m->get_client_tid() != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
	tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      // NOTE(review): cap_snaps still contains 'follows' at this point, so
      // the empty() test below looks like it can never pass -- confirm
      // whether this was meant to run after the erase()
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
	in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(follows);
    }
  } else {
    ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }

  m->put();
}
4913
// Context that carries a dentry-invalidation upcall onto the async finisher
// thread. The (possibly faked) dir/inode numbers and the dentry name are
// captured at construction time, because the Dentry may be gone by the time
// finish() runs.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory of the dentry
  vinodeno_t ino;     // target inode; null ino when 'del' is false
  string name;        // dentry name
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
      if (client->use_faked_inos()) {
	dirino.ino = dn->dir->parent_inode->faked_ino;
	if (del)
	  ino.ino = dn->inode->faked_ino;
      } else {
	dirino = dn->dir->parent_inode->vino();
	if (del)
	  ino = dn->inode->vino();
      }
      if (!del)
	ino.ino = inodeno_t();  // not deleting: pass a null ino
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
4941
4942 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
4943 {
4944 if (unmounting)
4945 return;
4946 ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
4947 << " in dir " << dirino << dendl;
4948 dentry_invalidate_cb(callback_handle, dirino, ino, name);
4949 }
4950
4951 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
4952 {
4953 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
4954 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
4955 }
4956
// Try to release cached state hanging off 'in' -- child dentries, an open
// .snap dir, and (when sched_inval is set) kernel dcache entries via the
// async invalidator -- so the inode itself becomes trimmable. 'ref' tracks
// the reference count we expect to release along the way.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance first: unlink() below may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the dir released one ref on 'in'
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    // also trim the .snap pseudo-directory if it has been opened
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    // ask the kernel to forget its dentries for this inode
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      // so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
4999
// Apply a GRANT/REVOKE/IMPORT cap update: refresh inode metadata fields we
// don't hold EXCL caps on, adjust the cap's issued/implemented bits, start
// writeback/cache release when caps are being revoked, and wake waiters.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const int old_caps = cap->issued;
  const int new_caps = m->get_caps();
  ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(old_caps) << dendl;
  cap->seq = m->get_seq();

  in->layout = m->get_layout();

  // update inode
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;

  // accept MDS values only for fields we don't hold exclusive caps on
  if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
    in->nlink = m->head.nlink;
    // link count reached zero with authoritative LINK caps: fully unlinked
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    bufferlist::iterator p = m->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
			 m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
			 m->get_mtime(), m->get_atime(),
			 m->inline_version, m->inline_data, issued);

  // max_size
  if (cap == in->auth_cap &&
      m->get_max_size() != in->max_size) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      // the grant already covers what we wanted; clear the pending request
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  // MDS's notion of 'wanted' is stale; schedule a check to correct it
  if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
    check = true;

  check_cap_issue(in, cap, new_caps);

  // update caps
  if (old_caps & ~new_caps) {
    ldout(cct, 10) << " revocation of " << ccap_string(~new_caps & old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // revoking FILE_BUFFER: flush dirty data first (the flush completion
    // re-runs check_caps); revoking FILE_CACHE: drop clean cached data
    if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
	&& !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }

  } else if (old_caps == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(old_caps) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
	if (it->second == cap)
	  continue;
	if (it->second->implemented & ~it->second->issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);

  m->put();
}
5113
// Fetch the supplementary group list for (uid, gid) into *sgids (a
// malloc'd buffer the caller owns). Tries the registered getgroups
// callback first, then falls back to getgrouplist(3) where available.
// Returns the number of groups, or a negative errno-style error.
int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid)
{
  // cppcheck-suppress variableScope
  int sgid_count;
  gid_t *sgid_buf;

  if (getgroups_cb) {
    sgid_count = getgroups_cb(callback_handle, &sgid_buf);
    if (sgid_count > 0) {
      *sgids = sgid_buf;
      return sgid_count;
    }
    // callback failed or returned no groups: fall through to getgrouplist
  }

#if HAVE_GETGROUPLIST
  struct passwd *pw;
  pw = getpwuid(uid);
  if (pw == NULL) {
    ldout(cct, 3) << "getting user entry failed" << dendl;
    // NOTE(review): errno may be 0 when the user simply doesn't exist, in
    // which case this returns 0 ("no groups") rather than an error -- confirm
    return -errno;
  }
  //use PAM to get the group list
  // initial number of group entries, defaults to posix standard of 16
  // PAM implementations may provide more than 16 groups....
  sgid_count = 16;
  sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t));
  if (sgid_buf == NULL) {
    ldout(cct, 3) << "allocating group memory failed" << dendl;
    return -ENOMEM;
  }

  while (1) {
#if defined(__APPLE__)
    if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) {
#else
    if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) {
#endif
      // on failure getgrouplist stores the required size in sgid_count;
      // we need to resize the group list and try again
      void *_realloc = NULL;
      if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) {
	ldout(cct, 3) << "allocating group memory failed" << dendl;
	free(sgid_buf);
	return -ENOMEM;
      }
      sgid_buf = (gid_t*)_realloc;
      continue;
    }
    // list was successfully retrieved
    break;
  }
  *sgids = sgid_buf;
  return sgid_count;
#else
  return 0;
#endif
}
5170
5171 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5172 {
5173 if (perms.uid() == 0)
5174 return 0;
5175
5176 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5177 int ret = _posix_acl_permission(in, perms, want);
5178 if (ret != -EAGAIN)
5179 return ret;
5180 }
5181
5182 // check permissions before doing anything else
5183 if (!in->check_mode(perms, want))
5184 return -EACCES;
5185 return 0;
5186 }
5187
// Permission check for accessing the extended attribute `name` on `in`.
// "system.*" xattrs may only be written by root or the inode owner; all
// other namespaces fall back to the normal inode permission check.
// Returns 0 if allowed, negative errno otherwise.
5188 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5189 const UserPerm& perms)
5190 {
// make sure mode/owner (and ACLs if enabled) are fresh before deciding
5191 int r = _getattr_for_perm(in, perms);
5192 if (r < 0)
5193 goto out;
5194
5195 r = 0;
5196 if (strncmp(name, "system.", 7) == 0) {
// writes to system.* are restricted to root and the owner
5197 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5198 r = -EPERM;
5199 } else {
5200 r = inode_permission(in, perms, want);
5201 }
5202 out:
5203 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5204 return r;
5205 }
5206
5207 ostream& operator<<(ostream &out, const UserPerm& perm) {
5208 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5209 return out;
5210 }
5211
// Check whether `perms` may apply the setattr described by `stx`/`mask`
// (CEPH_SETATTR_* bits) to inode `in`.  Implements POSIX chown/chmod/utimes
// ownership rules; returns 0 if permitted, negative errno otherwise.
5212 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5213 const UserPerm& perms)
5214 {
5215 ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
// refresh mode/owner (and ACLs if enabled) before deciding
5216 int r = _getattr_for_perm(in, perms);
5217 if (r < 0)
5218 goto out;
5219
// truncate requires write permission on the file
5220 if (mask & CEPH_SETATTR_SIZE) {
5221 r = inode_permission(in, perms, MAY_WRITE);
5222 if (r < 0)
5223 goto out;
5224 }
5225
// from here on, default to -EPERM unless every requested change passes
5226 r = -EPERM;
// chown: only root may change the uid; a non-root owner may "change" it
// to the value it already has
5227 if (mask & CEPH_SETATTR_UID) {
5228 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5229 goto out;
5230 }
// chgrp: non-root owner may only switch to a group they belong to (or
// keep the current group)
5231 if (mask & CEPH_SETATTR_GID) {
5232 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5233 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5234 goto out;
5235 }
5236
5237 if (mask & CEPH_SETATTR_MODE) {
// chmod: root or owner only
5238 if (perms.uid() != 0 && perms.uid() != in->uid)
5239 goto out;
5240
// non-root callers outside the file's (possibly new) group silently
// lose the setgid bit, matching POSIX chmod behavior
5241 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5242 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5243 stx->stx_mode &= ~S_ISGID;
5244 }
5245
5246 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5247 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5248 if (perms.uid() != 0 && perms.uid() != in->uid) {
// non-owner: setting an explicit timestamp needs ownership; setting
// *_NOW (utimes(NULL)-style) only needs write permission
5249 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5250 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5251 check_mask |= CEPH_SETATTR_MTIME;
5252 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5253 check_mask |= CEPH_SETATTR_ATIME;
5254 if (check_mask & mask) {
5255 goto out;
5256 } else {
5257 r = inode_permission(in, perms, MAY_WRITE);
5258 if (r < 0)
5259 goto out;
5260 }
5261 }
5262 }
5263 r = 0;
5264 out:
5265 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5266 return r;
5267 }
5268
// Check whether `perms` may open `in` with the given open(2) `flags`.
// Translates O_ACCMODE/O_TRUNC into MAY_* bits, rejects opens of symlinks
// (-ELOOP) and writable opens of directories (-EISDIR), then performs the
// normal inode permission check.  Returns 0 if allowed, negative errno if not.
5269 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5270 {
5271 ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
5272 unsigned want = 0;
5273
// map the access mode to MAY_* bits; O_TRUNC implies write
5274 if ((flags & O_ACCMODE) == O_WRONLY)
5275 want = MAY_WRITE;
5276 else if ((flags & O_ACCMODE) == O_RDWR)
5277 want = MAY_READ | MAY_WRITE;
5278 else if ((flags & O_ACCMODE) == O_RDONLY)
5279 want = MAY_READ;
5280 if (flags & O_TRUNC)
5281 want |= MAY_WRITE;
5282
5283 int r = 0;
5284 switch (in->mode & S_IFMT) {
5285 case S_IFLNK:
// a symlink itself can never be opened
5286 r = -ELOOP;
5287 goto out;
5288 case S_IFDIR:
// directories may only be opened read-only
5289 if (want & MAY_WRITE) {
5290 r = -EISDIR;
5291 goto out;
5292 }
5293 break;
5294 }
5295
// refresh mode/owner (and ACLs if enabled) before the final check
5296 r = _getattr_for_perm(in, perms);
5297 if (r < 0)
5298 goto out;
5299
5300 r = inode_permission(in, perms, want);
5301 out:
5302 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5303 return r;
5304 }
5305
5306 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5307 {
5308 ldout(cct, 20) << __func__ << *dir << "; " << perms << dendl;
5309 int r = _getattr_for_perm(dir, perms);
5310 if (r < 0)
5311 goto out;
5312
5313 r = inode_permission(dir, perms, MAY_EXEC);
5314 out:
5315 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5316 return r;
5317 }
5318
5319 int Client::may_create(Inode *dir, const UserPerm& perms)
5320 {
5321 ldout(cct, 20) << __func__ << *dir << "; " << perms << dendl;
5322 int r = _getattr_for_perm(dir, perms);
5323 if (r < 0)
5324 goto out;
5325
5326 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5327 out:
5328 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5329 return r;
5330 }
5331
5332 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5333 {
5334 ldout(cct, 20) << __func__ << *dir << "; " << "; name " << name << "; " << perms << dendl;
5335 int r = _getattr_for_perm(dir, perms);
5336 if (r < 0)
5337 goto out;
5338
5339 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5340 if (r < 0)
5341 goto out;
5342
5343 /* 'name == NULL' means rmsnap */
5344 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5345 InodeRef otherin;
5346 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5347 if (r < 0)
5348 goto out;
5349 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5350 r = -EPERM;
5351 }
5352 out:
5353 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5354 return r;
5355 }
5356
// Check whether `perms` may create a hard link to `in`.  Mirrors the
// kernel's protected_hardlinks policy: root and the owner may always link;
// otherwise the target must be a regular file, not setuid, not
// setgid+group-executable, and readable+writable by the caller.
// Returns 0 if allowed, negative errno otherwise.
5357 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5358 {
5359 ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
// refresh mode/owner (and ACLs if enabled) before deciding
5360 int r = _getattr_for_perm(in, perms);
5361 if (r < 0)
5362 goto out;
5363
// root and the owner may always hard-link
5364 if (perms.uid() == 0 || perms.uid() == in->uid) {
5365 r = 0;
5366 goto out;
5367 }
5368
// non-owner: deny unless all of the checks below pass
5369 r = -EPERM;
5370 if (!S_ISREG(in->mode))
5371 goto out;
5372
// never allow non-owners to link setuid files
5373 if (in->mode & S_ISUID)
5374 goto out;
5375
// nor setgid executables (setgid without group-exec is mandatory locking)
5376 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5377 goto out;
5378
5379 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5380 out:
5381 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5382 return r;
5383 }
5384
5385 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5386 {
5387 int mask = CEPH_STAT_CAP_MODE;
5388 bool force = false;
5389 if (acl_type != NO_ACL) {
5390 mask |= CEPH_STAT_CAP_XATTR;
5391 force = in->xattr_version == 0;
5392 }
5393 return _getattr(in, mask, perms, force);
5394 }
5395
// Return the (ino, snapid) pair that uniquely identifies this inode.
5396 vinodeno_t Client::_get_vino(Inode *in)
5397 {
5398 /* The caller must hold the client lock */
5399 return vinodeno_t(in->ino, in->snapid);
5400 }
5401
// Return the bare inode number of `in` (snapshot id not included).
5402 inodeno_t Client::_get_inodeno(Inode *in)
5403 {
5404 /* The caller must hold the client lock */
5405 return in->ino;
5406 }
5407
5408
5409 /**
5410 * Resolve an MDS spec to a list of MDS daemon GIDs.
5411 *
5412 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5413 * It may be '*' in which case it matches all GIDs.
5414 *
5415 * If no error is returned, the `targets` vector will be populated with at least
5416 * one MDS.
5417 */
5418 int Client::resolve_mds(
5419 const std::string &mds_spec,
5420 std::vector<mds_gid_t> *targets)
5421 {
5422 assert(fsmap);
5423 assert(targets != nullptr);
5424
5425 mds_role_t role;
5426 std::stringstream ss;
5427 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5428 if (role_r == 0) {
5429 // We got a role, resolve it to a GID
5430 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5431 << role << "'" << dendl;
5432 targets->push_back(
5433 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5434 return 0;
5435 }
5436
5437 std::string strtol_err;
5438 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5439 if (strtol_err.empty()) {
5440 // It is a possible GID
5441 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5442 if (fsmap->gid_exists(mds_gid)) {
5443 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5444 targets->push_back(mds_gid);
5445 } else {
5446 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5447 << dendl;
5448 return -ENOENT;
5449 }
5450 } else if (mds_spec == "*") {
5451 // It is a wildcard: use all MDSs
5452 const auto mds_info = fsmap->get_mds_info();
5453
5454 if (mds_info.empty()) {
5455 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5456 return -ENOENT;
5457 }
5458
5459 for (const auto i : mds_info) {
5460 targets->push_back(i.first);
5461 }
5462 } else {
5463 // It did not parse as an integer, it is not a wildcard, it must be a name
5464 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5465 if (mds_gid == 0) {
5466 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5467
5468 lderr(cct) << "FSMap: " << *fsmap << dendl;
5469
5470 return -ENOENT;
5471 } else {
5472 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5473 << "' to GID " << mds_gid << dendl;
5474 targets->push_back(mds_gid);
5475 }
5476 }
5477
5478 return 0;
5479 }
5480
5481
5482 /**
5483 * Authenticate with mon and establish global ID
5484 */
5485 int Client::authenticate()
5486 {
// Must be entered with client_lock held; returns 0 once we have a
// monitor-assigned global id, negative errno on auth failure.
5487 assert(client_lock.is_locked_by_me());
5488
5489 if (monclient->is_authenticated()) {
5490 return 0;
5491 }
5492
// drop client_lock across the blocking monitor handshake, then retake it
5493 client_lock.Unlock();
5494 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5495 client_lock.Lock();
5496 if (r < 0) {
5497 return r;
5498 }
5499
// adopt the global id the monitors assigned us as our entity name
5500 whoami = monclient->get_global_id();
5501 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5502
5503 return 0;
5504 }
5505
// Fetch the latest FSMap (or FSMapUser when `user` is true) from the
// monitors, blocking until our local copy is at least as new as the
// monitors' latest version.  Called with client_lock held; the lock is
// dropped while waiting on the monitor round-trips.
5506 int Client::fetch_fsmap(bool user)
5507 {
5508 int r;
5509 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5510 // rather than MDSMap because no one MDSMap contains all the daemons, and
5511 // a `tell` can address any daemon.
5512 version_t fsmap_latest;
// learn the latest map version; retry on transient -EAGAIN
5513 do {
5514 C_SaferCond cond;
5515 monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
5516 client_lock.Unlock();
5517 r = cond.wait();
5518 client_lock.Lock();
5519 } while (r == -EAGAIN);
5520
5521 if (r < 0) {
5522 lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
5523 return r;
5524 }
5525
5526 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5527
// subscribe (one-shot) and wait until the map we hold catches up
5528 if (user) {
5529 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5530 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5531 monclient->renew_subs();
5532 wait_on_list(waiting_for_fsmap);
5533 }
5534 assert(fsmap_user);
5535 assert(fsmap_user->get_epoch() >= fsmap_latest);
5536 } else {
5537 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5538 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5539 monclient->renew_subs();
5540 wait_on_list(waiting_for_fsmap);
5541 }
5542 assert(fsmap);
5543 assert(fsmap->get_epoch() >= fsmap_latest);
5544 }
5545 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5546 << fsmap_latest << dendl;
5547 return 0;
5548 }
5549
5550 /**
5551 *
5552 * @mds_spec one of ID, rank, GID, "*"
5553 *
5554 */
5555 int Client::mds_command(
5556 const std::string &mds_spec,
5557 const vector<string>& cmd,
5558 const bufferlist& inbl,
5559 bufferlist *outbl,
5560 string *outs,
5561 Context *onfinish)
5562 {
5563 Mutex::Locker lock(client_lock);
5564
5565 assert(initialized);
5566
5567 int r;
5568 r = authenticate();
5569 if (r < 0) {
5570 return r;
5571 }
5572
5573 r = fetch_fsmap(false);
5574 if (r < 0) {
5575 return r;
5576 }
5577
5578 // Look up MDS target(s) of the command
5579 std::vector<mds_gid_t> targets;
5580 r = resolve_mds(mds_spec, &targets);
5581 if (r < 0) {
5582 return r;
5583 }
5584
5585 // If daemons are laggy, we won't send them commands. If all
5586 // are laggy then we fail.
5587 std::vector<mds_gid_t> non_laggy;
5588 for (const auto gid : targets) {
5589 const auto info = fsmap->get_info_gid(gid);
5590 if (!info.laggy()) {
5591 non_laggy.push_back(gid);
5592 }
5593 }
5594 if (non_laggy.size() == 0) {
5595 *outs = "All targeted MDS daemons are laggy";
5596 return -ENOENT;
5597 }
5598
5599 if (metadata.empty()) {
5600 // We are called on an unmounted client, so metadata
5601 // won't be initialized yet.
5602 populate_metadata("");
5603 }
5604
5605 // Send commands to targets
5606 C_GatherBuilder gather(cct, onfinish);
5607 for (const auto target_gid : non_laggy) {
5608 const auto info = fsmap->get_info_gid(target_gid);
5609
5610 // Open a connection to the target MDS
5611 entity_inst_t inst = info.get_inst();
5612 ConnectionRef conn = messenger->get_connection(inst);
5613
5614 // Generate MDSCommandOp state
5615 auto &op = command_table.start_command();
5616
5617 op.on_finish = gather.new_sub();
5618 op.cmd = cmd;
5619 op.outbl = outbl;
5620 op.outs = outs;
5621 op.inbl = inbl;
5622 op.mds_gid = target_gid;
5623 op.con = conn;
5624
5625 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5626 << " tid=" << op.tid << cmd << dendl;
5627
5628 // Construct and send MCommand
5629 MCommand *m = op.get_message(monclient->get_fsid());
5630 conn->send_message(m);
5631 }
5632 gather.activate();
5633
5634 return 0;
5635 }
5636
// Handle an MCommandReply from an MDS: match it to the pending command op
// by tid, deliver output buffer/status, fire the completion, and drop the
// message reference in every path.
5637 void Client::handle_command_reply(MCommandReply *m)
5638 {
5639 ceph_tid_t const tid = m->get_tid();
5640
5641 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5642
// replies for unknown/cancelled tids are dropped (m still needs a put)
5643 if (!command_table.exists(tid)) {
5644 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5645 m->put();
5646 return;
5647 }
5648
5649 auto &op = command_table.get_command(tid);
// hand the reply payload and status string back to the caller, if wanted
5650 if (op.outbl) {
5651 op.outbl->claim(m->get_data());
5652 }
5653 if (op.outs) {
5654 *op.outs = m->rs;
5655 }
5656
// complete with the daemon's return code
5657 if (op.on_finish) {
5658 op.on_finish->complete(m->r);
5659 }
5660
5661 command_table.erase(tid);
5662
5663 m->put();
5664 }
5665
5666 // -------------------
5667 // MOUNT
5668
// Mount the filesystem at `mount_root` (default "/") with credentials
// `perms`.  Authenticates, subscribes to the (possibly namespaced) MDS map,
// optionally waits for an available MDS cluster, then walks up from the
// mount point fetching getattrs so quota-relevant ancestors are cached.
// Returns 0 on success, negative errno or CEPH_FUSE_NO_MDS_UP on failure.
5669 int Client::mount(const std::string &mount_root, const UserPerm& perms,
5670 bool require_mds)
5671 {
5672 Mutex::Locker lock(client_lock);
5673
5674 if (mounted) {
5675 ldout(cct, 5) << "already mounted" << dendl;
5676 return 0;
5677 }
5678
5679 int r = authenticate();
5680 if (r < 0) {
5681 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5682 return r;
5683 }
5684
// if a specific MDS namespace was configured, resolve it to a filesystem
// cluster id and subscribe to that filesystem's mdsmap only
5685 std::string want = "mdsmap";
5686 const auto &mds_ns = cct->_conf->client_mds_namespace;
5687 if (!mds_ns.empty()) {
5688 r = fetch_fsmap(true);
5689 if (r < 0)
5690 return r;
5691 fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
5692 if (cid == FS_CLUSTER_ID_NONE)
5693 return -ENOENT;
5694
5695 std::ostringstream oss;
5696 oss << want << "." << cid;
5697 want = oss.str();
5698 }
5699 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5700
5701 monclient->sub_want(want, 0, 0);
5702 monclient->renew_subs();
5703
5704 tick(); // start tick
5705
// optionally block until the MDS cluster can actually serve us
5706 if (require_mds) {
5707 while (1) {
5708 auto availability = mdsmap->is_cluster_available();
5709 if (availability == MDSMap::STUCK_UNAVAILABLE) {
5710 // Error out
5711 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
5712 return CEPH_FUSE_NO_MDS_UP;
5713 } else if (availability == MDSMap::AVAILABLE) {
5714 // Continue to mount
5715 break;
5716 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
5717 // Else, wait. MDSMonitor will update the map to bring
5718 // us to a conclusion eventually.
5719 wait_on_list(waiting_for_mdsmap);
5720 } else {
5721 // Unexpected value!
5722 ceph_abort();
5723 }
5724 }
5725 }
5726
5727 populate_metadata(mount_root.empty() ? "/" : mount_root);
5728
// getattr the mount point and then each ancestor up to the root, so the
// quota tree above the mount point is populated; EACCES partway up is
// tolerated (quotas may just not work)
5729 filepath fp(CEPH_INO_ROOT);
5730 if (!mount_root.empty()) {
5731 fp = filepath(mount_root.c_str());
5732 }
5733 while (true) {
5734 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
5735 req->set_filepath(fp);
5736 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
5737 int res = make_request(req, perms);
5738 if (res < 0) {
5739 if (res == -EACCES && root) {
5740 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
5741 break;
5742 }
5743 return res;
5744 }
5745
5746 if (fp.depth())
5747 fp.pop_dentry();
5748 else
5749 break;
5750 }
5751
5752 assert(root);
5753 _ll_get(root);
5754
5755 mounted = true;
5756
5757 // trace?
5758 if (!cct->_conf->client_trace.empty()) {
5759 traceout.open(cct->_conf->client_trace.c_str());
5760 if (traceout.is_open()) {
5761 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
5762 } else {
5763 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
5764 }
5765 }
5766
5767 /*
5768 ldout(cct, 3) << "op: // client trace data structs" << dendl;
5769 ldout(cct, 3) << "op: struct stat st;" << dendl;
5770 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
5771 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
5772 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
5773 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
5774 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
5775 ldout(cct, 3) << "op: int fd;" << dendl;
5776 */
5777 return 0;
5778 }
5779
5780 // UNMOUNT
5781
5782 void Client::_close_sessions()
5783 {
5784 while (!mds_sessions.empty()) {
5785 // send session closes!
5786 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5787 p != mds_sessions.end();
5788 ++p) {
5789 if (p->second->state != MetaSession::STATE_CLOSING) {
5790 _close_mds_session(p->second);
5791 }
5792 }
5793
5794 // wait for sessions to close
5795 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5796 mount_cond.Wait(client_lock);
5797 }
5798 }
5799
5800 void Client::flush_mdlog_sync()
5801 {
5802 if (mds_requests.empty())
5803 return;
5804 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5805 p != mds_sessions.end();
5806 ++p) {
5807 MetaSession *s = p->second;
5808 flush_mdlog(s);
5809 }
5810 }
5811
5812 void Client::flush_mdlog(MetaSession *session)
5813 {
5814 // Only send this to Luminous or newer MDS daemons, older daemons
5815 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5816 const uint64_t features = session->con->get_features();
5817 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5818 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5819 session->con->send_message(m);
5820 }
5821 }
5822
5823
// Tear down the mount: drain in-flight MDS requests, destroy leaked open
// files/dirs, flush and release all cached data and caps, empty the inode
// cache, and close the MDS sessions.  If we have been blacklisted, skip
// the clean flush path entirely (it could never complete).
5824 void Client::unmount()
5825 {
5826 Mutex::Locker lock(client_lock);
5827
5828 assert(mounted); // caller is confused?
5829
5830 ldout(cct, 2) << "unmounting" << dendl;
5831 unmounting = true;
5832
5833 flush_mdlog_sync(); // flush the mdlog for pending requests, if any
// wait for all outstanding MDS requests to drain
5834 while (!mds_requests.empty()) {
5835 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
5836 mount_cond.Wait(client_lock);
5837 }
5838
// stop the periodic tick
5839 if (tick_event)
5840 timer.cancel_event(tick_event);
5841 tick_event = 0;
5842
5843 cwd.reset();
5844
5845 // clean up any unclosed files
5846 while (!fd_map.empty()) {
5847 Fh *fh = fd_map.begin()->second;
5848 fd_map.erase(fd_map.begin());
5849 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
5850 _release_fh(fh);
5851 }
5852
// same for handles opened through the low-level (ll_) interface
5853 while (!ll_unclosed_fh_set.empty()) {
5854 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
5855 Fh *fh = *it;
5856 ll_unclosed_fh_set.erase(fh);
5857 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
5858 _release_fh(fh);
5859 }
5860
5861 while (!opened_dirs.empty()) {
5862 dir_result_t *dirp = *opened_dirs.begin();
5863 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
5864 _closedir(dirp);
5865 }
5866
5867 _ll_drop_pins();
5868
5869 if (blacklisted) {
5870 ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;
5871
5872 if (cct->_conf->client_oc) {
5873 // Purge all cached data so that ObjectCacher doesn't get hung up
5874 // trying to flush it. ObjectCacher's behaviour on EBLACKLISTED
5875 // is to just leave things marked dirty
5876 // (http://tracker.ceph.com/issues/9105)
5877 for (const auto &i : inode_map) {
5878 objectcacher->purge_set(&(i.second->oset));
5879 }
5880 }
5881
5882 mounted = false;
5883 return;
5884 }
5885
// wait for any unsafe (unacknowledged) sync writes to be committed
5886 while (unsafe_sync_write > 0) {
5887 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
5888 mount_cond.Wait(client_lock);
5889 }
5890
5891 if (cct->_conf->client_oc) {
5892 // flush/release all buffered data
// _release/_flush may drop the inode, so grab the successor first and
// hold a temporary ref while operating on each entry
5893 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
5894 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
5895 p != inode_map.end();
5896 p = next) {
5897 next = p;
5898 ++next;
5899 Inode *in = p->second;
5900 if (!in) {
5901 ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
5902 assert(in);
5903 }
5904 if (!in->caps.empty()) {
5905 InodeRef tmp_ref(in);
5906 _release(in);
5907 _flush(in, new C_Client_FlushComplete(this, in));
5908 }
5909 }
5910 }
5911
5912 flush_caps_sync();
5913 wait_sync_caps(last_flush_tid);
5914
5915 // empty lru cache
5916 lru.lru_set_max(0);
5917 trim_cache();
5918
// wait (with periodic cache dumps on timeout) until every inode is gone
5919 while (lru.lru_get_size() > 0 ||
5920 !inode_map.empty()) {
5921 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
5922 << "+" << inode_map.size() << " items"
5923 << ", waiting (for caps to release?)"
5924 << dendl;
5925 utime_t until = ceph_clock_now() + utime_t(5, 0);
5926 int r = mount_cond.WaitUntil(client_lock, until);
5927 if (r == ETIMEDOUT) {
5928 dump_cache(NULL);
5929 }
5930 }
5931 assert(lru.lru_get_size() == 0);
5932 assert(inode_map.empty());
5933
5934 // stop tracing
5935 if (!cct->_conf->client_trace.empty()) {
5936 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
5937 traceout.close();
5938 }
5939
5940 _close_sessions();
5941
5942 mounted = false;
5943
5944 ldout(cct, 2) << "unmounted." << dendl;
5945 }
5946
5947
5948
// Timer callback that drives Client::tick(); tick() re-arms itself by
// scheduling a fresh C_C_Tick on each run.
5949 class C_C_Tick : public Context {
5950 Client *client;
5951 public:
5952 explicit C_C_Tick(Client *c) : client(c) {}
5953 void finish(int r) override {
5954 // Called back via Timer, which takes client_lock for us
5955 assert(client->client_lock.is_locked_by_me());
5956 client->tick();
5957 }
5958 };
5959
5960 void Client::flush_cap_releases()
5961 {
5962 // send any cap releases
5963 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5964 p != mds_sessions.end();
5965 ++p) {
5966 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
5967 p->first)) {
5968 if (cct->_conf->client_inject_release_failure) {
5969 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
5970 p->second->release->put();
5971 } else {
5972 p->second->con->send_message(p->second->release);
5973 }
5974 p->second->release = 0;
5975 }
5976 }
5977 }
5978
// Periodic housekeeping, driven by C_C_Tick: time out pre-mount requests,
// renew caps and flush cap releases when we have an MDS map, process the
// delayed-caps list, and trim the cache.  Re-arms itself on every run.
5979 void Client::tick()
5980 {
// test hook: optionally stall one tick, then clear the injection knob
5981 if (cct->_conf->client_debug_inject_tick_delay > 0) {
5982 sleep(cct->_conf->client_debug_inject_tick_delay);
5983 assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
5984 cct->_conf->apply_changes(NULL);
5985 }
5986
5987 ldout(cct, 21) << "tick" << dendl;
// re-arm the timer for the next tick
5988 tick_event = new C_C_Tick(this);
5989 timer.add_event_after(cct->_conf->client_tick_interval, tick_event);
5990
5991 utime_t now = ceph_clock_now();
5992
// before the mount completes, abort the oldest pending request once it
// exceeds the mount timeout and wake up everything waiting on it
5993 if (!mounted && !mds_requests.empty()) {
5994 MetaRequest *req = mds_requests.begin()->second;
5995 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
5996 req->abort(-ETIMEDOUT);
5997 if (req->caller_cond) {
5998 req->kick = true;
5999 req->caller_cond->Signal();
6000 }
6001 signal_cond_list(waiting_for_mdsmap);
6002 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6003 p != mds_sessions.end();
6004 ++p)
6005 signal_context_list(p->second->waiting_for_open);
6006 }
6007 }
6008
6009 if (mdsmap->get_epoch()) {
6010 // renew caps?
6011 utime_t el = now - last_cap_renew;
6012 if (el > mdsmap->get_session_timeout() / 3.0)
6013 renew_caps();
6014
6015 flush_cap_releases();
6016 }
6017
6018 // delayed caps
// the list is ordered by hold_caps_until, so stop at the first inode
// whose hold time has not yet expired
6019 xlist<Inode*>::iterator p = delayed_caps.begin();
6020 while (!p.end()) {
6021 Inode *in = *p;
6022 ++p;
6023 if (in->hold_caps_until > now)
6024 break;
6025 delayed_caps.pop_front();
6026 cap_list.push_back(&in->cap_item);
6027 check_caps(in, CHECK_CAPS_NODELAY);
6028 }
6029
6030 trim_cache(true);
6031 }
6032
6033 void Client::renew_caps()
6034 {
6035 ldout(cct, 10) << "renew_caps()" << dendl;
6036 last_cap_renew = ceph_clock_now();
6037
6038 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6039 p != mds_sessions.end();
6040 ++p) {
6041 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6042 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6043 renew_caps(p->second);
6044 }
6045 }
6046
// Send a RENEWCAPS request on one session, stamping the request time and
// bumping the renewal seq so the MDS's reply can be matched to it.
6047 void Client::renew_caps(MetaSession *session)
6048 {
6049 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6050 session->last_cap_renew_request = ceph_clock_now();
6051 uint64_t seq = ++session->cap_renew_seq;
6052 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6053 }
6054
6055
6056 // ===============================================================
6057 // high level (POSIXy) interface
6058
6059 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6060 InodeRef *target, const UserPerm& perms)
6061 {
6062 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6063 MetaRequest *req = new MetaRequest(op);
6064 filepath path;
6065 dir->make_nosnap_relative_path(path);
6066 path.push_dentry(name);
6067 req->set_filepath(path);
6068 req->set_inode(dir);
6069 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6070 mask |= DEBUG_GETATTR_CAPS;
6071 req->head.args.getattr.mask = mask;
6072
6073 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6074
6075 int r = make_request(req, perms, target);
6076 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6077 return r;
6078 }
6079
// Resolve `dname` within directory `dir`, preferring cached answers.
// Handles "." / ".." / the snapdir specially, then tries the cached dentry
// (valid via either a per-dentry lease or the directory's FILE_SHARED cap),
// and falls back to an MDS lookup via _do_lookup().  On success *target is
// set; returns 0 or negative errno (-ENOENT, -ENOTDIR, -ENAMETOOLONG...).
6080 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6081 const UserPerm& perms)
6082 {
6083 int r = 0;
6084 Dentry *dn = NULL;
6085
6086 if (!dir->is_dir()) {
6087 r = -ENOTDIR;
6088 goto done;
6089 }
6090
// ".." resolves through the first parent dentry (root has none)
6091 if (dname == "..") {
6092 if (dir->dn_set.empty())
6093 *target = dir;
6094 else
6095 *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
6096 goto done;
6097 }
6098
6099 if (dname == ".") {
6100 *target = dir;
6101 goto done;
6102 }
6103
6104 if (dname.length() > NAME_MAX) {
6105 r = -ENAMETOOLONG;
6106 goto done;
6107 }
6108
// the configured snapdir name is synthesized locally, not looked up
6109 if (dname == cct->_conf->client_snapdir &&
6110 dir->snapid == CEPH_NOSNAP) {
6111 *target = open_snapdir(dir);
6112 goto done;
6113 }
6114
6115 if (dir->dir &&
6116 dir->dir->dentries.count(dname)) {
6117 dn = dir->dir->dentries[dname];
6118
6119 ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
6120 << " seq " << dn->lease_seq
6121 << dendl;
6122
// the cached dentry is only usable if its inode has the caps we need
6123 if (!dn->inode || dn->inode->caps_issued_mask(mask)) {
6124 // is dn lease valid?
6125 utime_t now = ceph_clock_now();
6126 if (dn->lease_mds >= 0 &&
6127 dn->lease_ttl > now &&
6128 mds_sessions.count(dn->lease_mds)) {
6129 MetaSession *s = mds_sessions[dn->lease_mds];
// the lease is only valid while the issuing session's caps are, and
// only if it was issued in the session's current cap generation
6130 if (s->cap_ttl > now &&
6131 s->cap_gen == dn->lease_gen) {
6132 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6133 // make trim_caps() behave.
6134 dir->try_touch_cap(dn->lease_mds);
6135 goto hit_dn;
6136 }
6137 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
6138 << " vs lease_gen " << dn->lease_gen << dendl;
6139 }
6140 // dir lease?
6141 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
6142 if (dn->cap_shared_gen == dir->shared_gen &&
6143 (!dn->inode || dn->inode->caps_issued_mask(mask)))
6144 goto hit_dn;
// a complete, shared-cap directory lets us answer ENOENT locally
6145 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6146 ldout(cct, 10) << "_lookup concluded ENOENT locally for "
6147 << *dir << " dn '" << dname << "'" << dendl;
6148 return -ENOENT;
6149 }
6150 }
6151 } else {
6152 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6153 }
6154 } else {
6155 // can we conclude ENOENT locally?
6156 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
6157 (dir->flags & I_COMPLETE)) {
6158 ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6159 return -ENOENT;
6160 }
6161 }
6162
// cache miss (or stale): ask the MDS
6163 r = _do_lookup(dir, dname, mask, target, perms);
6164 goto done;
6165
6166 hit_dn:
// cached dentry accepted; a null inode here is a cached negative entry
6167 if (dn->inode) {
6168 *target = dn->inode;
6169 } else {
6170 r = -ENOENT;
6171 }
6172 touch_dn(dn);
6173
6174 done:
6175 if (r < 0)
6176 ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
6177 else
6178 ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
6179 return r;
6180 }
6181
// Return (in *pdn) the dentry for `name` in `dir`, creating an empty
// linked-up dentry if none is cached.  When `expect_null` is set, finding
// an existing dentry with a currently-valid lease yields -EEXIST (used by
// exclusive-create paths).  Returns 0 otherwise.
6182 int Client::get_or_create(Inode *dir, const char* name,
6183 Dentry **pdn, bool expect_null)
6184 {
6185 // lookup
6186 ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
6187 dir->open_dir();
6188 if (dir->dir->dentries.count(name)) {
6189 Dentry *dn = dir->dir->dentries[name];
6190
6191 // is dn lease valid?
6192 utime_t now = ceph_clock_now();
6193 if (dn->inode &&
6194 dn->lease_mds >= 0 &&
6195 dn->lease_ttl > now &&
6196 mds_sessions.count(dn->lease_mds)) {
6197 MetaSession *s = mds_sessions[dn->lease_mds];
// lease is valid only within the session's cap TTL and generation
6198 if (s->cap_ttl > now &&
6199 s->cap_gen == dn->lease_gen) {
6200 if (expect_null)
6201 return -EEXIST;
6202 }
6203 }
6204 *pdn = dn;
6205 } else {
6206 // otherwise link up a new one
6207 *pdn = link(dir->dir, name, NULL, NULL);
6208 }
6209
6210 // success
6211 return 0;
6212 }
6213
// Walk `origpath` component by component from root (absolute) or cwd
// (relative), resolving each entry with _lookup() and following symlinks:
// mid-path symlinks are always followed, the trailing one only when
// `followsym` is set.  `mask` is the extra caps wanted on the final inode.
// On success stores the final inode in *end; returns 0 or negative errno
// (-ENOENT, -ELOOP after MAXSYMLINKS expansions, permission errors...).
6214 int Client::path_walk(const filepath& origpath, InodeRef *end,
6215 const UserPerm& perms, bool followsym, int mask)
6216 {
6217 filepath path = origpath;
6218 InodeRef cur;
6219 if (origpath.absolute())
6220 cur = root;
6221 else
6222 cur = cwd;
6223 assert(cur);
6224
6225 ldout(cct, 10) << "path_walk " << path << dendl;
6226
6227 int symlinks = 0;
6228
6229 unsigned i=0;
6230 while (i < path.depth() && cur) {
6231 int caps = 0;
6232 const string &dname = path[i];
6233 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6234 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6235 InodeRef next;
// enforce search permission on each traversed directory if configured
6236 if (cct->_conf->client_permissions) {
6237 int r = may_lookup(cur.get(), perms);
6238 if (r < 0)
6239 return r;
6240 caps = CEPH_CAP_AUTH_SHARED;
6241 }
6242
6243 /* Get extra requested caps on the last component */
6244 if (i == (path.depth() - 1))
6245 caps |= mask;
6246 int r = _lookup(cur.get(), dname, caps, &next, perms);
6247 if (r < 0)
6248 return r;
6249 // only follow trailing symlink if followsym. always follow
6250 // 'directory' symlinks.
6251 if (next && next->is_symlink()) {
6252 symlinks++;
6253 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6254 if (symlinks > MAXSYMLINKS) {
6255 return -ELOOP;
6256 }
6257
6258 if (i < path.depth() - 1) {
6259 // dir symlink
6260 // replace consumed components of path with symlink dir target
6261 filepath resolved(next->symlink.c_str());
6262 resolved.append(path.postfixpath(i + 1));
6263 path = resolved;
6264 i = 0;
// an absolute target restarts the walk from the root
6265 if (next->symlink[0] == '/') {
6266 cur = root;
6267 }
6268 continue;
6269 } else if (followsym) {
6270 if (next->symlink[0] == '/') {
6271 path = next->symlink.c_str();
6272 i = 0;
6273 // reset position
6274 cur = root;
6275 } else {
6276 filepath more(next->symlink.c_str());
6277 // we need to remove the symlink component from off of the path
6278 // before adding the target that the symlink points to. remain
6279 // at the same position in the path.
6280 path.pop_dentry();
6281 path.append(more);
6282 }
6283 continue;
6284 }
6285 }
6286 cur.swap(next);
6287 i++;
6288 }
6289 if (!cur)
6290 return -ENOENT;
6291 if (end)
6292 end->swap(cur);
6293 return 0;
6294 }
6295
6296
6297 // namespace ops
6298
// POSIX link(2): create a hard link at `relpath` pointing to the inode at
// `relexisting`.  When client_permissions is on, rejects linking
// directories and enforces may_hardlink on the target plus may_create on
// the destination directory.  Returns 0 or negative errno.
6299 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6300 {
6301 Mutex::Locker lock(client_lock);
6302 tout(cct) << "link" << std::endl;
6303 tout(cct) << relexisting << std::endl;
6304 tout(cct) << relpath << std::endl;
6305
6306 filepath existing(relexisting);
6307
6308 InodeRef in, dir;
// resolve the existing target, following symlinks
6309 int r = path_walk(existing, &in, perm, true);
6310 if (r < 0)
6311 return r;
// can't create a link at the root itself
6312 if (std::string(relpath) == "/") {
6313 r = -EEXIST;
6314 return r;
6315 }
// split destination into parent directory + new entry name
6316 filepath path(relpath);
6317 string name = path.last_dentry();
6318 path.pop_dentry();
6319
6320 r = path_walk(path, &dir, perm, true);
6321 if (r < 0)
6322 return r;
6323 if (cct->_conf->client_permissions) {
// directories cannot be hard-linked
6324 if (S_ISDIR(in->mode)) {
6325 r = -EPERM;
6326 return r;
6327 }
6328 r = may_hardlink(in.get(), perm);
6329 if (r < 0)
6330 return r;
6331 r = may_create(dir.get(), perm);
6332 if (r < 0)
6333 return r;
6334 }
6335 r = _link(in.get(), dir.get(), name.c_str(), perm);
6336 return r;
6337 }
6338
6339 int Client::unlink(const char *relpath, const UserPerm& perm)
6340 {
6341 Mutex::Locker lock(client_lock);
6342 tout(cct) << "unlink" << std::endl;
6343 tout(cct) << relpath << std::endl;
6344
6345 if (std::string(relpath) == "/")
6346 return -EISDIR;
6347
6348 filepath path(relpath);
6349 string name = path.last_dentry();
6350 path.pop_dentry();
6351 InodeRef dir;
6352 int r = path_walk(path, &dir, perm);
6353 if (r < 0)
6354 return r;
6355 if (cct->_conf->client_permissions) {
6356 r = may_delete(dir.get(), name.c_str(), perm);
6357 if (r < 0)
6358 return r;
6359 }
6360 return _unlink(dir.get(), name.c_str(), perm);
6361 }
6362
6363 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6364 {
6365 Mutex::Locker lock(client_lock);
6366 tout(cct) << "rename" << std::endl;
6367 tout(cct) << relfrom << std::endl;
6368 tout(cct) << relto << std::endl;
6369
6370 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6371 return -EBUSY;
6372
6373 filepath from(relfrom);
6374 filepath to(relto);
6375 string fromname = from.last_dentry();
6376 from.pop_dentry();
6377 string toname = to.last_dentry();
6378 to.pop_dentry();
6379
6380 InodeRef fromdir, todir;
6381 int r = path_walk(from, &fromdir, perm);
6382 if (r < 0)
6383 goto out;
6384 r = path_walk(to, &todir, perm);
6385 if (r < 0)
6386 goto out;
6387
6388 if (cct->_conf->client_permissions) {
6389 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6390 if (r < 0)
6391 return r;
6392 r = may_delete(todir.get(), toname.c_str(), perm);
6393 if (r < 0 && r != -ENOENT)
6394 return r;
6395 }
6396 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6397 out:
6398 return r;
6399 }
6400
6401 // dirs
6402
6403 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6404 {
6405 Mutex::Locker lock(client_lock);
6406 tout(cct) << "mkdir" << std::endl;
6407 tout(cct) << relpath << std::endl;
6408 tout(cct) << mode << std::endl;
6409 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6410
6411 if (std::string(relpath) == "/")
6412 return -EEXIST;
6413
6414 filepath path(relpath);
6415 string name = path.last_dentry();
6416 path.pop_dentry();
6417 InodeRef dir;
6418 int r = path_walk(path, &dir, perm);
6419 if (r < 0)
6420 return r;
6421 if (cct->_conf->client_permissions) {
6422 r = may_create(dir.get(), perm);
6423 if (r < 0)
6424 return r;
6425 }
6426 return _mkdir(dir.get(), name.c_str(), mode, perm);
6427 }
6428
// Create every missing directory along relpath (like `mkdir -p`).
// Returns 0 on success, -EEXIST if the full path already exists, or the
// first negative errno encountered while walking or creating.
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;  // walk is relative to the current working directory
  for (i = 0; i < path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
        break;
      // once permissions are enforced, lookups need AUTH_SHARED caps
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  // i == depth: every component resolved, nothing to create.
  if (i == path.depth()) return -EEXIST;
  // any failure other than "component missing" is a real error.
  if (r != -ENOENT) return r;
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i < path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
        return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // a racing creator may have made an intermediate dir; for non-final
    // components fall back to looking it up instead of failing.
    if (-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
                   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6482
6483 int Client::rmdir(const char *relpath, const UserPerm& perms)
6484 {
6485 Mutex::Locker lock(client_lock);
6486 tout(cct) << "rmdir" << std::endl;
6487 tout(cct) << relpath << std::endl;
6488
6489 if (std::string(relpath) == "/")
6490 return -EBUSY;
6491
6492 filepath path(relpath);
6493 string name = path.last_dentry();
6494 path.pop_dentry();
6495 InodeRef dir;
6496 int r = path_walk(path, &dir, perms);
6497 if (r < 0)
6498 return r;
6499 if (cct->_conf->client_permissions) {
6500 int r = may_delete(dir.get(), name.c_str(), perms);
6501 if (r < 0)
6502 return r;
6503 }
6504 return _rmdir(dir.get(), name.c_str(), perms);
6505 }
6506
6507 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6508 {
6509 Mutex::Locker lock(client_lock);
6510 tout(cct) << "mknod" << std::endl;
6511 tout(cct) << relpath << std::endl;
6512 tout(cct) << mode << std::endl;
6513 tout(cct) << rdev << std::endl;
6514
6515 if (std::string(relpath) == "/")
6516 return -EEXIST;
6517
6518 filepath path(relpath);
6519 string name = path.last_dentry();
6520 path.pop_dentry();
6521 InodeRef dir;
6522 int r = path_walk(path, &dir, perms);
6523 if (r < 0)
6524 return r;
6525 if (cct->_conf->client_permissions) {
6526 int r = may_create(dir.get(), perms);
6527 if (r < 0)
6528 return r;
6529 }
6530 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6531 }
6532
6533 // symlinks
6534
6535 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6536 {
6537 Mutex::Locker lock(client_lock);
6538 tout(cct) << "symlink" << std::endl;
6539 tout(cct) << target << std::endl;
6540 tout(cct) << relpath << std::endl;
6541
6542 if (std::string(relpath) == "/")
6543 return -EEXIST;
6544
6545 filepath path(relpath);
6546 string name = path.last_dentry();
6547 path.pop_dentry();
6548 InodeRef dir;
6549 int r = path_walk(path, &dir, perms);
6550 if (r < 0)
6551 return r;
6552 if (cct->_conf->client_permissions) {
6553 int r = may_create(dir.get(), perms);
6554 if (r < 0)
6555 return r;
6556 }
6557 return _symlink(dir.get(), name.c_str(), target, perms);
6558 }
6559
6560 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6561 {
6562 Mutex::Locker lock(client_lock);
6563 tout(cct) << "readlink" << std::endl;
6564 tout(cct) << relpath << std::endl;
6565
6566 filepath path(relpath);
6567 InodeRef in;
6568 int r = path_walk(path, &in, perms, false);
6569 if (r < 0)
6570 return r;
6571
6572 return _readlink(in.get(), buf, size);
6573 }
6574
6575 int Client::_readlink(Inode *in, char *buf, size_t size)
6576 {
6577 if (!in->is_symlink())
6578 return -EINVAL;
6579
6580 // copy into buf (at most size bytes)
6581 int r = in->symlink.length();
6582 if (r > (int)size)
6583 r = size;
6584 memcpy(buf, in->symlink.c_str(), r);
6585 return r;
6586 }
6587
6588
6589 // inode stuff
6590
// Fetch inode attributes covered by `mask` (CEPH_CAP_* shared bits).
// If the client already holds caps covering the whole mask the cached
// attributes are authoritative and no MDS round trip is made, unless
// `force` is set.  Returns 0 or a negative errno from the MDS request.
int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
{
  bool yes = in->caps_issued_mask(mask);

  ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
  if (yes && !force)
    return 0;  // cache is sufficient

  // otherwise issue a GETATTR to the MDS for this inode
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->head.args.getattr.mask = mask;

  int res = make_request(req, perms);
  ldout(cct, 10) << "_getattr result=" << res << dendl;
  return res;
}
6610
// Apply the attribute changes in `stx` selected by `mask` (CEPH_SETATTR_*).
// Changes covered by exclusive caps held locally are applied to the cached
// inode and marked dirty (flushed later); anything left over is sent to the
// MDS as a synchronous SETATTR request.  On success *inp (if non-NULL)
// receives the updated inode from the MDS reply.  Returns 0 or -errno.
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must not push the containing quota over its limit
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
                              perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  // only if the caller matches whoever dirtied the caps; otherwise force
  // a sync request so each identity is checked in its own context.
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
                   << " != cap dirtier " << in->cap_dirtier_uid << ":"
                   << in->cap_dirtier_gid << ", forcing sync setattr"
                   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold so the new ctime is flushed;
    // with none held, fall through to a sync CTIME request below.
    if (issued & CEPH_CAP_AUTH_EXCL)
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  // with AUTH_EXCL we can apply ownership/mode/btime changes locally,
  // clearing the corresponding bits from `mask` as we go.
  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only permission bits change; the file-type bits are preserved
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode)) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~S_ISUID;
      // setgid is only cleared when it denotes a real setgid executable
      // (group-exec set), not mandatory-locking mode
      if ((in->mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP))
        in->mode &= ~S_ISGID;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  // with FILE_EXCL, timestamps can likewise be updated locally
  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask) {
    // everything was handled locally
    in->change_attr++;
    return 0;
  }

force_request:
  // whatever remains in `mask` must be done synchronously on the MDS
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  // for each attribute, also drop caps other clients would otherwise
  // use to cache the now-stale value
  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    // refuse truncate/extend beyond the cluster-wide max file size
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6803
6804 /* Note that we only care about attrs that setattr cares about */
6805 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6806 {
6807 stx->stx_size = st->st_size;
6808 stx->stx_mode = st->st_mode;
6809 stx->stx_uid = st->st_uid;
6810 stx->stx_gid = st->st_gid;
6811 stx->stx_mtime = st->st_mtim;
6812 stx->stx_atime = st->st_atim;
6813 }
6814
6815 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6816 const UserPerm& perms, InodeRef *inp)
6817 {
6818 int ret = _do_setattr(in, stx, mask, perms, inp);
6819 if (ret < 0)
6820 return ret;
6821 if (mask & CEPH_SETATTR_MODE)
6822 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6823 return ret;
6824 }
6825
6826 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6827 const UserPerm& perms)
6828 {
6829 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6830 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6831 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6832 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6833 if (cct->_conf->client_permissions) {
6834 int r = may_setattr(in.get(), stx, mask, perms);
6835 if (r < 0)
6836 return r;
6837 }
6838 return __setattrx(in.get(), stx, mask, perms);
6839 }
6840
6841 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6842 const UserPerm& perms)
6843 {
6844 struct ceph_statx stx;
6845
6846 stat_to_statx(attr, &stx);
6847 mask &= ~CEPH_SETATTR_BTIME;
6848 return _setattrx(in, &stx, mask, perms);
6849 }
6850
6851 int Client::setattr(const char *relpath, struct stat *attr, int mask,
6852 const UserPerm& perms)
6853 {
6854 Mutex::Locker lock(client_lock);
6855 tout(cct) << "setattr" << std::endl;
6856 tout(cct) << relpath << std::endl;
6857 tout(cct) << mask << std::endl;
6858
6859 filepath path(relpath);
6860 InodeRef in;
6861 int r = path_walk(path, &in, perms);
6862 if (r < 0)
6863 return r;
6864 return _setattr(in, attr, mask, perms);
6865 }
6866
6867 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6868 const UserPerm& perms, int flags)
6869 {
6870 Mutex::Locker lock(client_lock);
6871 tout(cct) << "setattrx" << std::endl;
6872 tout(cct) << relpath << std::endl;
6873 tout(cct) << mask << std::endl;
6874
6875 filepath path(relpath);
6876 InodeRef in;
6877 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6878 if (r < 0)
6879 return r;
6880 return _setattrx(in, stx, mask, perms);
6881 }
6882
6883 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6884 {
6885 Mutex::Locker lock(client_lock);
6886 tout(cct) << "fsetattr" << std::endl;
6887 tout(cct) << fd << std::endl;
6888 tout(cct) << mask << std::endl;
6889
6890 Fh *f = get_filehandle(fd);
6891 if (!f)
6892 return -EBADF;
6893 #if defined(__linux__) && defined(O_PATH)
6894 if (f->flags & O_PATH)
6895 return -EBADF;
6896 #endif
6897 return _setattr(f->inode, attr, mask, perms);
6898 }
6899
6900 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6901 {
6902 Mutex::Locker lock(client_lock);
6903 tout(cct) << "fsetattr" << std::endl;
6904 tout(cct) << fd << std::endl;
6905 tout(cct) << mask << std::endl;
6906
6907 Fh *f = get_filehandle(fd);
6908 if (!f)
6909 return -EBADF;
6910 #if defined(__linux__) && defined(O_PATH)
6911 if (f->flags & O_PATH)
6912 return -EBADF;
6913 #endif
6914 return _setattrx(f->inode, stx, mask, perms);
6915 }
6916
6917 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
6918 frag_info_t *dirstat, int mask)
6919 {
6920 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6921 Mutex::Locker lock(client_lock);
6922 tout(cct) << "stat" << std::endl;
6923 tout(cct) << relpath << std::endl;
6924 filepath path(relpath);
6925 InodeRef in;
6926 int r = path_walk(path, &in, perms, true, mask);
6927 if (r < 0)
6928 return r;
6929 r = _getattr(in, mask, perms);
6930 if (r < 0) {
6931 ldout(cct, 3) << "stat exit on error!" << dendl;
6932 return r;
6933 }
6934 fill_stat(in, stbuf, dirstat);
6935 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
6936 return r;
6937 }
6938
6939 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
6940 {
6941 unsigned mask = 0;
6942
6943 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
6944 if (flags & AT_NO_ATTR_SYNC)
6945 goto out;
6946
6947 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
6948 mask |= CEPH_CAP_PIN;
6949 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6950 mask |= CEPH_CAP_AUTH_SHARED;
6951 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6952 mask |= CEPH_CAP_LINK_SHARED;
6953 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
6954 mask |= CEPH_CAP_FILE_SHARED;
6955 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
6956 mask |= CEPH_CAP_XATTR_SHARED;
6957 out:
6958 return mask;
6959 }
6960
6961 int Client::statx(const char *relpath, struct ceph_statx *stx,
6962 const UserPerm& perms,
6963 unsigned int want, unsigned int flags)
6964 {
6965 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
6966 Mutex::Locker lock(client_lock);
6967 tout(cct) << "statx" << std::endl;
6968 tout(cct) << relpath << std::endl;
6969 filepath path(relpath);
6970 InodeRef in;
6971
6972 unsigned mask = statx_to_mask(flags, want);
6973
6974 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
6975 if (r < 0)
6976 return r;
6977
6978 r = _getattr(in, mask, perms);
6979 if (r < 0) {
6980 ldout(cct, 3) << "statx exit on error!" << dendl;
6981 return r;
6982 }
6983
6984 fill_statx(in, mask, stx);
6985 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
6986 return r;
6987 }
6988
6989 int Client::lstat(const char *relpath, struct stat *stbuf,
6990 const UserPerm& perms, frag_info_t *dirstat, int mask)
6991 {
6992 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6993 Mutex::Locker lock(client_lock);
6994 tout(cct) << "lstat" << std::endl;
6995 tout(cct) << relpath << std::endl;
6996 filepath path(relpath);
6997 InodeRef in;
6998 // don't follow symlinks
6999 int r = path_walk(path, &in, perms, false, mask);
7000 if (r < 0)
7001 return r;
7002 r = _getattr(in, mask, perms);
7003 if (r < 0) {
7004 ldout(cct, 3) << "lstat exit on error!" << dendl;
7005 return r;
7006 }
7007 fill_stat(in, stbuf, dirstat);
7008 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7009 return r;
7010 }
7011
// Populate a struct stat (and optionally dirstat/rstat) from the cached
// inode.  Returns the caps currently issued for the inode so the caller
// can judge how fresh the values are.
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
                 << " mode 0" << oct << in->mode << dec
                 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  // expose faked inos when the client is configured to remap them
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // the snapid doubles as the device number to distinguish snapshots
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  st->st_nlink = in->nlink;
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // ctime reported is whichever of ctime/mtime is later
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // directory "size": recursive bytes or entry count, per config
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    // 512-byte blocks, rounded up
    st->st_blocks = (in->size + 511) >> 9;
  }
  // advertise the stripe unit as the IO block size (min 4k)
  st->st_blksize = MAX(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7058
// Populate a ceph_statx from the cached inode.  `mask` is the CEPH_CAP_*
// set that is known fresh; only fields covered by those caps are filled in
// and reported via stx_mask.  A zero mask (AT_NO_ATTR_SYNC) fills everything
// from cache.
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
                 << " mode 0" << oct << in->mode << dec
                 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  // ownership, full mode, and birthtime require AUTH_SHARED
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  // link count requires LINK_SHARED
  if (mask & CEPH_CAP_LINK_SHARED) {
    stx->stx_nlink = in->nlink;
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  // timestamps and size/blocks require FILE_SHARED
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // directory "size": recursive bytes or entry count, per config
      if (cct->_conf->client_dirsize_rbytes)
        stx->stx_size = in->rstat.rbytes;
      else
        stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
                      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // reported ctime is whichever of ctime/mtime is later
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7126
// Mark a dentry as recently used so the LRU keeps it cached longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7131
7132 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7133 {
7134 Mutex::Locker lock(client_lock);
7135 tout(cct) << "chmod" << std::endl;
7136 tout(cct) << relpath << std::endl;
7137 tout(cct) << mode << std::endl;
7138 filepath path(relpath);
7139 InodeRef in;
7140 int r = path_walk(path, &in, perms);
7141 if (r < 0)
7142 return r;
7143 struct stat attr;
7144 attr.st_mode = mode;
7145 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7146 }
7147
7148 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7149 {
7150 Mutex::Locker lock(client_lock);
7151 tout(cct) << "fchmod" << std::endl;
7152 tout(cct) << fd << std::endl;
7153 tout(cct) << mode << std::endl;
7154 Fh *f = get_filehandle(fd);
7155 if (!f)
7156 return -EBADF;
7157 #if defined(__linux__) && defined(O_PATH)
7158 if (f->flags & O_PATH)
7159 return -EBADF;
7160 #endif
7161 struct stat attr;
7162 attr.st_mode = mode;
7163 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7164 }
7165
7166 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7167 {
7168 Mutex::Locker lock(client_lock);
7169 tout(cct) << "lchmod" << std::endl;
7170 tout(cct) << relpath << std::endl;
7171 tout(cct) << mode << std::endl;
7172 filepath path(relpath);
7173 InodeRef in;
7174 // don't follow symlinks
7175 int r = path_walk(path, &in, perms, false);
7176 if (r < 0)
7177 return r;
7178 struct stat attr;
7179 attr.st_mode = mode;
7180 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7181 }
7182
7183 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7184 const UserPerm& perms)
7185 {
7186 Mutex::Locker lock(client_lock);
7187 tout(cct) << "chown" << std::endl;
7188 tout(cct) << relpath << std::endl;
7189 tout(cct) << new_uid << std::endl;
7190 tout(cct) << new_gid << std::endl;
7191 filepath path(relpath);
7192 InodeRef in;
7193 int r = path_walk(path, &in, perms);
7194 if (r < 0)
7195 return r;
7196 struct stat attr;
7197 attr.st_uid = new_uid;
7198 attr.st_gid = new_gid;
7199 int mask = 0;
7200 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7201 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7202 return _setattr(in, &attr, mask, perms);
7203 }
7204
7205 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7206 {
7207 Mutex::Locker lock(client_lock);
7208 tout(cct) << "fchown" << std::endl;
7209 tout(cct) << fd << std::endl;
7210 tout(cct) << new_uid << std::endl;
7211 tout(cct) << new_gid << std::endl;
7212 Fh *f = get_filehandle(fd);
7213 if (!f)
7214 return -EBADF;
7215 #if defined(__linux__) && defined(O_PATH)
7216 if (f->flags & O_PATH)
7217 return -EBADF;
7218 #endif
7219 struct stat attr;
7220 attr.st_uid = new_uid;
7221 attr.st_gid = new_gid;
7222 int mask = 0;
7223 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7224 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7225 return _setattr(f->inode, &attr, mask, perms);
7226 }
7227
7228 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7229 const UserPerm& perms)
7230 {
7231 Mutex::Locker lock(client_lock);
7232 tout(cct) << "lchown" << std::endl;
7233 tout(cct) << relpath << std::endl;
7234 tout(cct) << new_uid << std::endl;
7235 tout(cct) << new_gid << std::endl;
7236 filepath path(relpath);
7237 InodeRef in;
7238 // don't follow symlinks
7239 int r = path_walk(path, &in, perms, false);
7240 if (r < 0)
7241 return r;
7242 struct stat attr;
7243 attr.st_uid = new_uid;
7244 attr.st_gid = new_gid;
7245 int mask = 0;
7246 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7247 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7248 return _setattr(in, &attr, mask, perms);
7249 }
7250
7251 int Client::utime(const char *relpath, struct utimbuf *buf,
7252 const UserPerm& perms)
7253 {
7254 Mutex::Locker lock(client_lock);
7255 tout(cct) << "utime" << std::endl;
7256 tout(cct) << relpath << std::endl;
7257 tout(cct) << buf->modtime << std::endl;
7258 tout(cct) << buf->actime << std::endl;
7259 filepath path(relpath);
7260 InodeRef in;
7261 int r = path_walk(path, &in, perms);
7262 if (r < 0)
7263 return r;
7264 struct stat attr;
7265 stat_set_mtime_sec(&attr, buf->modtime);
7266 stat_set_mtime_nsec(&attr, 0);
7267 stat_set_atime_sec(&attr, buf->actime);
7268 stat_set_atime_nsec(&attr, 0);
7269 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7270 }
7271
7272 int Client::lutime(const char *relpath, struct utimbuf *buf,
7273 const UserPerm& perms)
7274 {
7275 Mutex::Locker lock(client_lock);
7276 tout(cct) << "lutime" << std::endl;
7277 tout(cct) << relpath << std::endl;
7278 tout(cct) << buf->modtime << std::endl;
7279 tout(cct) << buf->actime << std::endl;
7280 filepath path(relpath);
7281 InodeRef in;
7282 // don't follow symlinks
7283 int r = path_walk(path, &in, perms, false);
7284 if (r < 0)
7285 return r;
7286 struct stat attr;
7287 stat_set_mtime_sec(&attr, buf->modtime);
7288 stat_set_mtime_nsec(&attr, 0);
7289 stat_set_atime_sec(&attr, buf->actime);
7290 stat_set_atime_nsec(&attr, 0);
7291 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7292 }
7293
7294 int Client::flock(int fd, int operation, uint64_t owner)
7295 {
7296 Mutex::Locker lock(client_lock);
7297 tout(cct) << "flock" << std::endl;
7298 tout(cct) << fd << std::endl;
7299 tout(cct) << operation << std::endl;
7300 tout(cct) << owner << std::endl;
7301 Fh *f = get_filehandle(fd);
7302 if (!f)
7303 return -EBADF;
7304
7305 return _flock(f, operation, owner);
7306 }
7307
7308 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7309 {
7310 Mutex::Locker lock(client_lock);
7311 tout(cct) << "opendir" << std::endl;
7312 tout(cct) << relpath << std::endl;
7313 filepath path(relpath);
7314 InodeRef in;
7315 int r = path_walk(path, &in, perms, true);
7316 if (r < 0)
7317 return r;
7318 if (cct->_conf->client_permissions) {
7319 int r = may_open(in.get(), O_RDONLY, perms);
7320 if (r < 0)
7321 return r;
7322 }
7323 r = _opendir(in.get(), dirpp, perms);
7324 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7325 if (r != -ENOTDIR)
7326 tout(cct) << (unsigned long)*dirpp << std::endl;
7327 return r;
7328 }
7329
7330 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7331 {
7332 if (!in->is_dir())
7333 return -ENOTDIR;
7334 *dirpp = new dir_result_t(in, perms);
7335 opened_dirs.insert(*dirpp);
7336 ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7337 return 0;
7338 }
7339
7340
// Close a directory handle previously returned by opendir().
// Always returns 0; the handle is freed and must not be reused.
int Client::closedir(dir_result_t *dir)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "closedir" << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7351
// Tear down a dir_result_t: detach its inode reference, drop any
// buffered dirents, deregister it from opened_dirs and free it.
// 'dirp' is invalid after this returns.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
    // may drop the last reference to the directory inode
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7363
7364 void Client::rewinddir(dir_result_t *dirp)
7365 {
7366 Mutex::Locker lock(client_lock);
7367
7368 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
7369 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7370 _readdir_drop_dirp_buffer(d);
7371 d->reset();
7372 }
7373
7374 loff_t Client::telldir(dir_result_t *dirp)
7375 {
7376 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7377 ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
7378 return d->offset;
7379 }
7380
// seekdir(3): reposition the directory stream to 'offset', a value
// previously obtained from telldir().  Drops the buffered dirents when
// they cannot serve the target position.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  // already there: nothing to do
  if (offset == dirp->offset)
    return;

  // Any seek weakens what we can claim about the directory's cache
  // state when the walk finishes (see readdir_r_cb's I_COMPLETE logic).
  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash-order streams can only replay forward; a backward seek
    // forces a restart of the walk.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Drop the buffer on rewind, on a seek into a different frag, or
    // on a backward seek within the current frag — in all three cases
    // the buffered window does not contain the target offset.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7411
7412
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
// Populate a struct dirent from name/type/ino.  'next_off' is the
// stream offset of the *next* entry (what telldir() would report after
// consuming this one).
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // d_name is 256 bytes (see layout above): copy at most 255 chars and
  // NUL-terminate explicitly, since strncpy does not when the source
  // is longer than the limit.
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  // d_off is not a dirent member on Darwin/FreeBSD
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  // convert S_IF* mode bits to the DT_* encoding dirent uses
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7435
// Advance the stream past the just-consumed fragment, or mark the
// stream at end if that was the rightmost frag of the directory.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    // fg.next() is naive; the tree may have split/merged under us, so
    // re-align with the inode's current dirfragtree.
    _readdir_rechoose_frag(dirp);
  }
}
7461
// If the frag implied by dirp->offset no longer exists in the inode's
// dirfragtree (it was split or merged), snap the stream to the frag
// that now covers that position and restart it from its first real
// entry.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  // hash-order streams are positioned by hash value, not by frag
  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    // offsets 0 and 1 are reserved for the synthesized "." and ".."
    dirp->next_offset = 2;
  }
}
7478
// Discard any buffered dirents for this stream; the next read will
// refetch from the cache or the MDS.
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
  dirp->buffer.clear();
}
7484
7485 int Client::_readdir_get_frag(dir_result_t *dirp)
7486 {
7487 assert(dirp);
7488 assert(dirp->inode);
7489
7490 // get the current frag.
7491 frag_t fg;
7492 if (dirp->hash_order())
7493 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7494 else
7495 fg = frag_t(dirp->offset_high());
7496
7497 ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
7498 << " offset " << hex << dirp->offset << dec << dendl;
7499
7500 int op = CEPH_MDS_OP_READDIR;
7501 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7502 op = CEPH_MDS_OP_LSSNAP;
7503
7504 InodeRef& diri = dirp->inode;
7505
7506 MetaRequest *req = new MetaRequest(op);
7507 filepath path;
7508 diri->make_nosnap_relative_path(path);
7509 req->set_filepath(path);
7510 req->set_inode(diri.get());
7511 req->head.args.readdir.frag = fg;
7512 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7513 if (dirp->last_name.length()) {
7514 req->path2.set_path(dirp->last_name.c_str());
7515 } else if (dirp->hash_order()) {
7516 req->head.args.readdir.offset_hash = dirp->offset_high();
7517 }
7518 req->dirp = dirp;
7519
7520 bufferlist dirbl;
7521 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7522
7523 if (res == -EAGAIN) {
7524 ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
7525 _readdir_rechoose_frag(dirp);
7526 return _readdir_get_frag(dirp);
7527 }
7528
7529 if (res == 0) {
7530 ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
7531 << " size " << dirp->buffer.size() << dendl;
7532 } else {
7533 ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
7534 dirp->set_end();
7535 }
7536
7537 return res;
7538 }
7539
7540 struct dentry_off_lt {
7541 bool operator()(const Dentry* dn, int64_t off) const {
7542 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
7543 }
7544 };
7545
// Serve readdir entries directly from the in-memory dentry cache,
// invoking 'cb' once per entry (client_lock is dropped around each
// callback).  Returns 0 at end-of-directory, -EAGAIN if the cached
// view stops being complete+ordered mid-walk (caller then falls back
// to fetching frags from the MDS), a negative errno on error, or the
// callback's positive value if it asked to stop early.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume at the first cached dentry at/after the stream's offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // recheck each iteration: the lock is dropped around the callback
    // below, so the cache may have been invalidated under us
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      // negative dentry: nothing to report
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // stale entry from a previous shared-cap generation
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    // refresh attributes to the mask the caller asked for
    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // the callback may block or re-enter the client; drop our lock
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7628
// Core readdir driver: invoke 'cb' for each entry of 'd' starting at
// the stream's current offset.  "." and ".." are synthesized first
// (offsets 0 and 1); real entries are served from the local dentry
// cache when it is complete and ordered, otherwise from dirfrags
// fetched from the MDS.  client_lock is dropped around every callback.
//
// Returns 0 at end of directory, a negative errno on error, or the
// callback's positive return value if it asked to stop early.  If
// 'getref' is true, a reference is taken (via _ll_get) on each inode
// handed to the callback.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "."
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // callback may block or re-enter the client
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".."
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;  // no parent dentry known: ".." maps to the dir itself
    else
      in = diri->get_first_parent()->inode;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache stopped being usable mid-walk; fall through to
    // the MDS-fetching path below
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;  // reply is fresh; no per-entry getattr needed
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // walk the buffered entries at/after the stream offset
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // this frag has more entries than the buffer holds
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // We walked the whole directory.  If nothing changed underneath us
    // since the walk started, we can mark our cached view complete
    // (and, if no reordering happened, ordered too).
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  // trim stale tail entries beyond what this walk validated
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // unreachable: the loop above always returns
  return 0;
}
7818
7819
// readdir_r(3) equivalent: fetch the next entry of 'd' into *de.
// Returns 1 if an entry was filled, 0 at end of directory, <0 on error.
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
7824
7825 /*
7826 * readdirplus_r
7827 *
7828 * returns
7829 * 1 if we got a dirent
7830 * 0 for end of directory
7831 * <0 on error
7832 */
7833
// Callback context for _readdir_single_dirent_cb: captures exactly one
// directory entry per readdir_r_cb invocation.
struct single_readdir {
  struct dirent *de;       // destination dirent (never NULL)
  struct ceph_statx *stx;  // optional statx destination (may be NULL)
  Inode *inode;            // inode pointer handed back by the callback
  bool full;               // set once an entry has been captured
};
7840
7841 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
7842 struct ceph_statx *stx, off_t off,
7843 Inode *in)
7844 {
7845 single_readdir *c = static_cast<single_readdir *>(p);
7846
7847 if (c->full)
7848 return -1; // already filled this dirent
7849
7850 *c->de = *de;
7851 if (c->stx)
7852 *c->stx = *stx;
7853 c->inode = in;
7854 c->full = true;
7855 return 1;
7856 }
7857
// readdir(3) equivalent: return the next entry of 'd', or NULL at end
// of directory or on error (errno set in the error case).
// NOTE: returns a pointer to a function-local static dirent, so each
// call overwrites the previous result and concurrent callers race on
// the buffer — same caveat as libc readdir(3).
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    errno = -ret; // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;
}
7880
// readdirplus: fetch the next entry of 'd' into *de, optionally with
// its statx attributes (*stx, per want/flags) and a referenced Inode*
// (*out, if out != NULL).  Returns 1 when an entry was filled, 0 at
// end of directory, <0 on error.
int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
			  struct ceph_statx *stx, unsigned want,
			  unsigned flags, Inode **out)
{
  single_readdir sr;
  sr.de = de;
  sr.stx = stx;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  // Note: 'out' (a pointer) converts to the bool 'getref' parameter,
  // so an inode reference is taken exactly when the caller asked for
  // the inode back.
  int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
  if (r < -1)
    return r;
  if (out)
    *out = sr.inode;
  if (sr.full)
    return 1;
  return 0;
}
7902
7903
7904 /* getdents */
/* getdents */
// Callback context for _readdir_getdent_cb: packs entries into a
// caller-supplied flat buffer.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: pack whole dirents; false: pack names only
};
7911
7912 static int _readdir_getdent_cb(void *p, struct dirent *de,
7913 struct ceph_statx *stx, off_t off, Inode *in)
7914 {
7915 struct getdents_result *c = static_cast<getdents_result *>(p);
7916
7917 int dlen;
7918 if (c->fullent)
7919 dlen = sizeof(*de);
7920 else
7921 dlen = strlen(de->d_name) + 1;
7922
7923 if (c->pos + dlen > c->buflen)
7924 return -1; // doesn't fit
7925
7926 if (c->fullent) {
7927 memcpy(c->buf + c->pos, de, sizeof(*de));
7928 } else {
7929 memcpy(c->buf + c->pos, de->d_name, dlen);
7930 }
7931 c->pos += dlen;
7932 return 0;
7933 }
7934
// getdents(2)-style bulk read: fill 'buf' with as many entries of
// 'dir' as fit (whole dirents if 'fullent', else packed NUL-terminated
// names).  Returns the number of bytes written, -ERANGE if not even
// one entry fits, or a negative errno from the underlying readdir.
int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
{
  getdents_result gr;
  gr.buf = buf;
  gr.buflen = buflen;
  gr.fullent = fullent;
  gr.pos = 0;

  int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);

  if (r < 0) { // some error
    // NOTE(review): -1 here is the callback's "buffer full" sentinel,
    // but on Linux -EPERM is also -1 — a genuine EPERM from the walk
    // would be misreported as buffer exhaustion.  Confirm whether the
    // underlying path can actually surface EPERM.
    if (r == -1) { // buffer ran out of space
      if (gr.pos) { // but we got some entries already!
	return gr.pos;
      } // or we need a larger buffer
      return -ERANGE;
    } else { // actual error, return it
      return r;
    }
  }
  return gr.pos;
}
7957
7958
7959 /* getdir */
/* getdir */
// Callback context for _getdir_cb: collects entry names and counts them.
struct getdir_result {
  list<string> *contents;  // caller's output list of entry names
  int num;                 // number of entries appended
};
7964
7965 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
7966 {
7967 getdir_result *r = static_cast<getdir_result *>(p);
7968
7969 r->contents->push_back(de->d_name);
7970 r->num++;
7971 return 0;
7972 }
7973
7974 int Client::getdir(const char *relpath, list<string>& contents,
7975 const UserPerm& perms)
7976 {
7977 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
7978 {
7979 Mutex::Locker lock(client_lock);
7980 tout(cct) << "getdir" << std::endl;
7981 tout(cct) << relpath << std::endl;
7982 }
7983
7984 dir_result_t *d;
7985 int r = opendir(relpath, &d, perms);
7986 if (r < 0)
7987 return r;
7988
7989 getdir_result gr;
7990 gr.contents = &contents;
7991 gr.num = 0;
7992 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
7993
7994 closedir(d);
7995
7996 if (r < 0)
7997 return r;
7998 return gr.num;
7999 }
8000
8001
8002 /****** file i/o **********/
/****** file i/o **********/
// open(2) equivalent: open (and with O_CREAT, possibly create) the
// file at 'relpath'.  Optional striping parameters only apply to a
// newly created file.  Returns a non-negative fd on success or a
// negative errno (-EEXIST, -ELOOP, path/permission errors, ...).
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  // NOTE(review): the mask passed to path_walk is derived from 'mode'
  // (permission bits), not from the open flags — confirm intended.
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // target is missing and we may create it: walk to the parent dir
    // and create the final dentry there
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create() may already have produced an Fh; only open otherwise
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8084
// Convenience overload: open with the filesystem's default file
// striping parameters (zeros mean "use defaults").
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8090
8091 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8092 const UserPerm& perms)
8093 {
8094 Mutex::Locker lock(client_lock);
8095 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8096
8097 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8098 filepath path(ino);
8099 req->set_filepath(path);
8100
8101 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8102 char f[30];
8103 sprintf(f, "%u", h);
8104 filepath path2(dirino);
8105 path2.push_dentry(string(f));
8106 req->set_filepath2(path2);
8107
8108 int r = make_request(req, perms, NULL, NULL,
8109 rand() % mdsmap->get_num_in_mds());
8110 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8111 return r;
8112 }
8113
8114
8115 /**
8116 * Load inode into local cache.
8117 *
8118 * If inode pointer is non-NULL, and take a reference on
8119 * the resulting Inode object in one operation, so that caller
8120 * can safely assume inode will still be there after return.
8121 */
/**
 * Load inode into local cache.
 *
 * If inode pointer is non-NULL, and take a reference on
 * the resulting Inode object in one operation, so that caller
 * can safely assume inode will still be there after return.
 */
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // any active MDS can serve a LOOKUPINO; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // the reply handler inserted the inode; it must be in the map now
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);  // reference handed to the caller
  }
  ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
  return r;
}
8142
8143
8144
8145 /**
8146 * Find the parent inode of `ino` and insert it into
8147 * our cache. Conditionally also set `parent` to a referenced
8148 * Inode* if caller provides non-NULL value.
8149 */
8150 int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8151 {
8152 Mutex::Locker lock(client_lock);
8153 ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
8154
8155 if (!ino->dn_set.empty()) {
8156 // if we exposed the parent here, we'd need to check permissions,
8157 // but right now we just rely on the MDS doing so in make_request
8158 ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
8159 return 0;
8160 }
8161
8162 if (ino->is_root()) {
8163 *parent = NULL;
8164 ldout(cct, 3) << "ino is root, no parent" << dendl;
8165 return -EINVAL;
8166 }
8167
8168 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8169 filepath path(ino->ino);
8170 req->set_filepath(path);
8171
8172 InodeRef target;
8173 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8174 // Give caller a reference to the parent ino if they provided a pointer.
8175 if (parent != NULL) {
8176 if (r == 0) {
8177 *parent = target.get();
8178 _ll_get(*parent);
8179 ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
8180 } else {
8181 *parent = NULL;
8182 }
8183 }
8184 ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
8185 return r;
8186 }
8187
8188
8189 /**
8190 * Populate the parent dentry for `ino`, provided it is
8191 * a child of `parent`.
8192 */
/**
 * Populate the parent dentry for `ino`, provided it is
 * a child of `parent`.
 */
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  assert(parent->is_dir());

  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  // LOOKUPNAME: path2 = the directory, path = the child inode; the
  // MDS replies with the linking dentry, which the reply handler
  // inserts into our cache.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8209
8210
// Build a new file handle for 'in' opened with the given flags and
// ceph open mode, owned by 'perms'.  Also configures per-handle
// readahead limits from config and the file layout.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  assert(in);
  Fh *f = new Fh(in);
  f->mode = cmode;
  f->flags = flags;

  // inode
  f->actor_perms = perms;

  ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // snapshot inodes are immutable; track open handles via
    // snap_cap_refs instead of the normal cap machinery
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
		  << ccap_string(in->caps_issued()) << dendl;
  }

  // readahead: bounded by both a byte limit and a number of layout
  // periods, whichever is smaller (0 disables the respective bound)
  const md_config_t *conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead windows to the layout period and stripe unit
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
8247
// Release a file handle: drop its open reference on the inode
// (flushing and re-evaluating caps if it was the last opener),
// release its file locks, and surface any asynchronous write error
// recorded on the handle.  Returns that async error (0 if none).
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last opener in this mode: flush dirty data and let the cap
      // state machine release what we no longer want
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inode: just drop the snap open reference
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8280
8281 void Client::_put_fh(Fh *f)
8282 {
8283 int left = f->put();
8284 if (!left) {
8285 delete f;
8286 }
8287 }
8288
// Open 'in' with the given flags/mode, producing a file handle in
// *fhp on success.  If we already hold the caps the open mode wants
// (and no truncate is requested), the open is satisfied locally;
// otherwise an MDS OPEN request is issued.  Returns 0 or a negative
// errno.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 &&
      in->caps_issued_mask(want)) {
    // we already hold everything this open needs; no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    // O_CREAT is handled by the caller (Client::open/_create)
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size; // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // open failed: undo the pending-open reference taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8339
// Re-acquire file caps for an inode whose caps lapsed.  If we still
// hold some caps (and either want no write caps or still have an auth
// cap), just nudge the cap state machine; otherwise replay an OPEN to
// the MDS with flags reconstructed from the wanted caps.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // map wanted RD/WR caps back onto open flags for the replayed OPEN
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8377
8378 int Client::close(int fd)
8379 {
8380 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8381 Mutex::Locker lock(client_lock);
8382 tout(cct) << "close" << std::endl;
8383 tout(cct) << fd << std::endl;
8384
8385 Fh *fh = get_filehandle(fd);
8386 if (!fh)
8387 return -EBADF;
8388 int err = _release_fh(fh);
8389 fd_map.erase(fd);
8390 put_fd(fd);
8391 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8392 return err;
8393 }
8394
8395
8396 // ------------
8397 // read, write
8398
// ------------
// read, write

// lseek(2) equivalent on an fd.  Returns the new position or a
// negative errno (-EBADF for unknown fds or O_PATH handles).
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors permit no I/O, matching kernel behavior
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _lseek(f, offset, whence);
}
8416
// Reposition a file handle.  SEEK_END consults the MDS/caps for the
// current size via _getattr.  Returns the new position or a negative
// errno.
// NOTE(review): no validation that the resulting position is
// non-negative — SEEK_SET/SEEK_CUR with a large negative offset leaves
// f->pos negative instead of returning -EINVAL.  Confirm intended.
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  int r;

  switch (whence) {
  case SEEK_SET:
    f->pos = offset;
    break;

  case SEEK_CUR:
    f->pos += offset;
    break;

  case SEEK_END:
    // need an up-to-date size to seek relative to EOF
    r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
    f->pos = in->size + offset;
    break;

  default:
    ceph_abort();
  }

  ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
8445
8446
// Serialize access to f->pos among concurrent users of the same file
// handle.  If the position is already locked (or other waiters are
// queued), enqueue a Cond and wait in FIFO order; client_lock is
// released while waiting (Cond::Wait) and re-held on return.
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until the lock is free AND we are at the head of the queue
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8464
8465 void Client::unlock_fh_pos(Fh *f)
8466 {
8467 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8468 f->pos_locked = false;
8469 }
8470
// Migrate a file's inlined data (stored in the inode on the MDS) out to
// its first RADOS object so the normal object I/O path can be used.
// Completion/failure is reported asynchronously through 'onfinish'.
// Returns 0 once the mutations have been submitted (immediately when
// there is no inline data at all).
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // Object name of chunk 0 of this inode: "<ino-in-hex>.00000000".
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // Phase 1: make sure the object exists (non-exclusive create; fire and
  // forget — no completion is attached).
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  // Phase 2: write the inline payload, guarded by a cmpxattr on the
  // object's "inline_version" xattr so racing/stale uninline attempts do
  // not clobber newer data.  NOTE(review): confirm the exact direction of
  // the CEPH_OSD_CMPXATTR_OP_GT comparison against the OSD cmpxattr docs.
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8515
8516 //
8517
8518 // blocking osd interface
8519
8520 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8521 {
8522 Mutex::Locker lock(client_lock);
8523 tout(cct) << "read" << std::endl;
8524 tout(cct) << fd << std::endl;
8525 tout(cct) << size << std::endl;
8526 tout(cct) << offset << std::endl;
8527
8528 Fh *f = get_filehandle(fd);
8529 if (!f)
8530 return -EBADF;
8531 #if defined(__linux__) && defined(O_PATH)
8532 if (f->flags & O_PATH)
8533 return -EBADF;
8534 #endif
8535 bufferlist bl;
8536 int r = _read(f, offset, size, &bl);
8537 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8538 if (r >= 0) {
8539 bl.copy(0, bl.length(), buf);
8540 r = bl.length();
8541 }
8542 return r;
8543 }
8544
8545 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8546 {
8547 if (iovcnt < 0)
8548 return -EINVAL;
8549 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8550 }
8551
// Core read path.  Reads 'size' bytes at 'offset' — or at the fh's
// current position when offset < 0, in which case the position is
// advanced afterwards.  Dispatches between inline data, the cached
// (objectcacher) path and the sync OSD path.  Returns the number of
// bytes appended to *bl, or a negative error.
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  // The handle must have been opened with read permission.
  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // offset < 0 means "read at the current file position"; serialize
  // positional I/O on this handle via the fh pos lock.
  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means the inline-data state is unknown; fetch it
  // from the MDS before choosing a read path.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
        unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  // O_DIRECT bypasses the object cache even when we hold the CACHE cap.
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // No cache cap: migrate the inline data out to RADOS and fall
      // through to the normal read path; completion is awaited at 'done'.
      onuninline = new C_SafeCond(&uninline_flock,
                                  &uninline_cond,
                                  &uninline_done,
                                  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // Serve the read straight from the inline data held in the inode,
      // zero-filling between the inline length and EOF.
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // Cached path.  O_RSYNC requires dirty data in the range to be
    // written back before the read is served.
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    // Sync path (no cache cap, O_DIRECT, or forced by config).
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // Short read that may be a stale-size artifact: drop our caps,
      // re-fetch the authoritative size, and retry if more data exists.
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof? short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // If an uninline was started above, wait for it to complete, dropping
  // client_lock while blocked on the side mutex/cond.
  if (onuninline) {
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else uninlined first; either way the
    // inline copy is now stale and the object is authoritative.
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
8695
// Completion context for a background readahead.  Pins the Fh with a
// reference and marks one readahead as pending on its readahead state;
// both are undone in the destructor.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
8701
Client::C_Readahead::~C_Readahead() {
  // Balance the pending count and the Fh reference taken in the ctor.
  f->readahead.dec_pending();
  client->_put_fh(f);
}
8706
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // Drop the cap refs taken in _read_async when the readahead was issued.
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8711
// Read through the object cacher (caller holds the FILE_CACHE cap).
// Trims the request to the locally-known file size, performs the
// (possibly blocking) cached read, then opportunistically kicks off
// readahead for subsequent data.  Returns bytes read or negative error.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                              off, len, bl, 0, onfinish);
  if (r == 0) {
    // Cache miss: the cacher owns onfinish now.  Pin the CACHE cap and
    // block on the side mutex/cond (client_lock dropped) until done.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  // Fire-and-forget readahead; C_Readahead::finish drops the cap refs
  // taken below when the background read completes.
  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
                     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                                       readahead_extent.first, readahead_extent.second,
                                       NULL, 0, onfinish2);
      if (r2 == 0) {
        ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
        get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
        ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
        delete onfinish2;
      }
    }
  }

  return r;
}
8776
// Read directly from the OSDs, bypassing the object cacher.  Loops until
// the request is satisfied, zero-filling holes up to the locally-known
// EOF.  On a short read that might be a stale-size artifact, sets
// *checkeof so the caller can re-verify the size with the MDS and retry.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  // NOTE(review): 'int' truncates len > INT_MAX — presumably callers
  // never request that much in one call; confirm.
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    // Block for the OSD reply, dropping client_lock while waiting.
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // Got less than asked for and we're at/past the locally-known
      // size: let the caller re-check EOF against the MDS.
      *checkeof = true;
      return read;
    }
  }
  return read;
}
8843
8844
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync can "do the right thing" and wait for them all to commit.
 */
// Called when a synchronous (uncached) write has committed on the OSDs:
// drop the unsafe-write count and the FILE_BUFFER cap ref taken in
// _write(), and wake the unmount path if it is waiting for the last
// unsafe write to drain.
void Client::_sync_write_commit(Inode *in)
{
  assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
8862
8863 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
8864 {
8865 Mutex::Locker lock(client_lock);
8866 tout(cct) << "write" << std::endl;
8867 tout(cct) << fd << std::endl;
8868 tout(cct) << size << std::endl;
8869 tout(cct) << offset << std::endl;
8870
8871 Fh *fh = get_filehandle(fd);
8872 if (!fh)
8873 return -EBADF;
8874 #if defined(__linux__) && defined(O_PATH)
8875 if (fh->flags & O_PATH)
8876 return -EBADF;
8877 #endif
8878 int r = _write(fh, offset, size, buf, NULL, 0);
8879 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
8880 return r;
8881 }
8882
8883 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
8884 {
8885 if (iovcnt < 0)
8886 return -EINVAL;
8887 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
8888 }
8889
// Shared implementation behind preadv()/pwritev().  Sums the iovec
// lengths, then either performs one gather-write, or performs one read
// and scatters the result back across the caller's iovecs.
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be used for I/O.
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Scatter the bufferlist back over the iovecs; the read may
    // legitimately have returned fewer bytes than the iovecs can hold.
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
8936
// Core write path.  Writes 'size' bytes from 'buf' — or gathered from
// 'iov' when buf is NULL — at 'offset' (or at the fh position / EOF for
// O_APPEND when offset < 0).  Performs size/quota/pool-full checks,
// handles inline data, and dispatches between the buffered
// (objectcacher) and sync OSD paths, then updates size, mtime and dirty
// caps.  Returns bytes written or a negative error.
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	           const struct iovec *iov, int iovcnt)
{
  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  // NOTE(review): endoff is computed here, before the offset < 0 branch
  // below resolves the real position — so for positional/O_APPEND writes
  // this quota check (and the get_caps endoff further down) uses a bogus
  // end offset.  Confirm against upstream, which later reordered this.
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      // O_APPEND: reposition to the authoritative EOF first.
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    f->pos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // Unknown inline-data state: fetch it from the MDS first.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely((in->mode & S_ISUID) ||
	       (in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) {
    struct ceph_statx stx = { 0 };

    // AUTH_SHARED was only needed to read in->mode above; drop it before
    // the (potentially blocking) setattr.
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses the buffer cache even when we hold the cap.
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
	endoff > CEPH_INLINE_MAX_SIZE ||
	!(have & CEPH_CAP_FILE_BUFFER)) {
      // Write no longer fits inline (or we lack the buffer cap): migrate
      // the inline data out to RADOS and fall through to a normal write.
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // Apply the write directly to the inline data held in the inode:
      // preserve the tail past endoff, drop the overwritten span (or
      // zero-fill a hole), then append the new bytes.
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
	in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
	in->inline_data.splice(offset, len - offset);
      else if (offset > len)
	in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    // Block for the OSD ack, dropping client_lock while we wait.
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    mark_caps_dirty(in, CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = ceph_clock_now();
  in->change_attr++;
  mark_caps_dirty(in, CEPH_CAP_FILE_WR);

done:

  // If an uninline was started above, wait for it (client_lock dropped).
  if (onuninline) {
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else uninlined first; either way the
    // inline copy is stale and the object is now authoritative.
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9175
9176 int Client::_flush(Fh *f)
9177 {
9178 Inode *in = f->inode.get();
9179 int err = f->take_async_err();
9180 if (err != 0) {
9181 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9182 << cpp_strerror(err) << dendl;
9183 } else {
9184 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9185 }
9186
9187 return err;
9188 }
9189
9190 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9191 {
9192 struct ceph_statx stx;
9193 stx.stx_size = length;
9194 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9195 }
9196
9197 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9198 {
9199 Mutex::Locker lock(client_lock);
9200 tout(cct) << "ftruncate" << std::endl;
9201 tout(cct) << fd << std::endl;
9202 tout(cct) << length << std::endl;
9203
9204 Fh *f = get_filehandle(fd);
9205 if (!f)
9206 return -EBADF;
9207 #if defined(__linux__) && defined(O_PATH)
9208 if (f->flags & O_PATH)
9209 return -EBADF;
9210 #endif
9211 struct stat attr;
9212 attr.st_size = length;
9213 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9214 }
9215
// fsync(2): flush dirty data (and, unless syncdataonly, metadata) for
// the given fd.  Also surfaces any asynchronous write error recorded on
// the handle, clearing it so it is reported at most once.
int Client::fsync(int fd, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be used for I/O.
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9248
// Inode-level fsync: flush dirty data through the object cacher (and,
// unless syncdataonly, flush dirty caps and wait out unsafe MDS ops),
// then block until everything has committed.  Returns 0 or the
// writeback error.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Push dirty caps to the MDS and remember the flush tid to wait on
    // once the data flush completes.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // Wait for the newest unsafe (unjournaled) MDS op on this inode;
    // ops become safe in order, so waiting on the last covers them all.
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    // Block on the side mutex/cond, dropping client_lock meanwhile.
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9313
9314 int Client::_fsync(Fh *f, bool syncdataonly)
9315 {
9316 ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9317 return _fsync(f->inode.get(), syncdataonly);
9318 }
9319
9320 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9321 {
9322 Mutex::Locker lock(client_lock);
9323 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9324 tout(cct) << fd << std::endl;
9325
9326 Fh *f = get_filehandle(fd);
9327 if (!f)
9328 return -EBADF;
9329 int r = _getattr(f->inode, mask, perms);
9330 if (r < 0)
9331 return r;
9332 fill_stat(f->inode, stbuf, NULL);
9333 ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9334 return r;
9335 }
9336
9337 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9338 unsigned int want, unsigned int flags)
9339 {
9340 Mutex::Locker lock(client_lock);
9341 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9342 tout(cct) << fd << std::endl;
9343
9344 Fh *f = get_filehandle(fd);
9345 if (!f)
9346 return -EBADF;
9347
9348 unsigned mask = statx_to_mask(flags, want);
9349
9350 int r = 0;
9351 if (mask && !f->inode->caps_issued_mask(mask)) {
9352 r = _getattr(f->inode, mask, perms);
9353 if (r < 0) {
9354 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9355 return r;
9356 }
9357 }
9358
9359 fill_statx(f->inode, mask, stx);
9360 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9361 return r;
9362 }
9363
9364 // not written yet, but i want to link!
9365
// chdir(2): resolve 'relpath' to an inode, make it the client's cwd, and
// report the resulting absolute path back through 'new_cwd'.
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;
  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  // Swap rather than assign so the old cwd ref is released via 'in'.
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  getcwd(new_cwd, perms);
  return 0;
}
9384
// Build the absolute path of the current working directory by walking
// parent dentries from cwd up to the mount root, issuing a LOOKUPNAME to
// the MDS for any ancestor whose parent dentry is not cached (and
// restarting the walk afterwards).  Leaves 'dir' untouched if cwd or an
// ancestor has been unlinked.
void Client::getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << "getcwd " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    assert(in->dn_set.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dn_set.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9424
// statfs(2): report filesystem statistics.  Queries the overall RADOS
// cluster stats (blocking with client_lock dropped); if the mount root
// carries a byte quota and client_quota_df is enabled, the quota is
// reported as the filesystem size instead of the raw cluster numbers.
// Note: 'path' is unused — statistics are global to the mount.
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;

  ceph_statfs stats;
  C_SaferCond cond;
  objecter->get_fs_stats(stats, &cond);

  // Block for the objecter reply with client_lock dropped.
  client_lock.Unlock();
  int rval = cond.wait();
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks.  We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = stats.num_objects;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the overall RADOS cluster's statistics.  Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    // (kb values are shifted by CEPH_BLOCK_SHIFT - 10 to convert KB to
    // 4MB-block counts.)
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9514
// Issue a GETFILELOCK/SETFILELOCK request for 'in' to the MDS and, on a
// successful SETFILELOCK, replay the change into the client-side lock
// trackers: always the inode-level tracker, and also the Fh-level one
// unless 'removing' is set (i.e. we are tearing the Fh down in
// _release_filelocks and its trackers are about to be destroyed).
//
// @param in        inode the lock applies to
// @param fh        open file handle; supplies actor_perms and per-Fh state
// @param lock_type lock "rule": CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
// @param op        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
// @param sleep     non-zero to block until a conflicting lock clears
// @param fl        POSIX lock description (filled in for GETFILELOCK)
// @param owner     lock-owner token; the top bit is forced on (see below)
// @param removing  skip updating Fh-level state being torn down
// @return 0 on success, negative errno on failure
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Translate the POSIX lock type into the Ceph lock command.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // Only a SETFILELOCK that actually acquires a lock may block; GETFILELOCK
  // and unlocks never wait at the MDS.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the lock the MDS reported (conflicting lock, or F_UNLCK if
      // none) back into POSIX struct flock form for the caller.
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the accepted change into the inode-level tracker, creating
      // it lazily on first use.
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
	// Also record the lock on the file handle so it is released
	// automatically when the Fh is closed (see _release_filelocks).
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9625
// Interrupt a blocked SETFILELOCK request.  Marks 'req' aborted with
// -EINTR so it will not be re-sent; if it already reached an MDS, also
// issues a companion "lock intr" unlock request so the MDS stops waiting
// on our behalf.  Returns 0 or a negative errno from the intr request.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Map the original lock rule to its *_INTR counterpart.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // Build an unlock that mirrors the original lock's parameters, but with
  // the interrupt rule so the MDS cancels the pending wait.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // Reuse the credentials of the request being interrupted.
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
9658
9659 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9660 {
9661 if (!in->fcntl_locks && !in->flock_locks)
9662 return;
9663
9664 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9665 ::encode(nr_fcntl_locks, bl);
9666 if (nr_fcntl_locks) {
9667 ceph_lock_state_t* lock_state = in->fcntl_locks;
9668 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9669 p != lock_state->held_locks.end();
9670 ++p)
9671 ::encode(p->second, bl);
9672 }
9673
9674 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9675 ::encode(nr_flock_locks, bl);
9676 if (nr_flock_locks) {
9677 ceph_lock_state_t* lock_state = in->flock_locks;
9678 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9679 p != lock_state->held_locks.end();
9680 ++p)
9681 ::encode(p->second, bl);
9682 }
9683
9684 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
9685 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
9686 }
9687
9688 void Client::_release_filelocks(Fh *fh)
9689 {
9690 if (!fh->fcntl_locks && !fh->flock_locks)
9691 return;
9692
9693 Inode *in = fh->inode.get();
9694 ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
9695
9696 list<pair<int, ceph_filelock> > to_release;
9697
9698 if (fh->fcntl_locks) {
9699 ceph_lock_state_t* lock_state = fh->fcntl_locks;
9700 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9701 p != lock_state->held_locks.end();
9702 ++p)
9703 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
9704 delete fh->fcntl_locks;
9705 }
9706 if (fh->flock_locks) {
9707 ceph_lock_state_t* lock_state = fh->flock_locks;
9708 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9709 p != lock_state->held_locks.end();
9710 ++p)
9711 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
9712 delete fh->flock_locks;
9713 }
9714
9715 if (to_release.empty())
9716 return;
9717
9718 struct flock fl;
9719 memset(&fl, 0, sizeof(fl));
9720 fl.l_whence = SEEK_SET;
9721 fl.l_type = F_UNLCK;
9722
9723 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
9724 p != to_release.end();
9725 ++p) {
9726 fl.l_start = p->second.start;
9727 fl.l_len = p->second.length;
9728 fl.l_pid = p->second.pid;
9729 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
9730 p->second.owner, true);
9731 }
9732 }
9733
9734 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
9735 ceph_lock_state_t *lock_state)
9736 {
9737 int lock_cmd;
9738 if (F_RDLCK == fl->l_type)
9739 lock_cmd = CEPH_LOCK_SHARED;
9740 else if (F_WRLCK == fl->l_type)
9741 lock_cmd = CEPH_LOCK_EXCL;
9742 else
9743 lock_cmd = CEPH_LOCK_UNLOCK;;
9744
9745 ceph_filelock filelock;
9746 filelock.start = fl->l_start;
9747 filelock.length = fl->l_len;
9748 filelock.client = 0;
9749 // see comment in _do_filelock()
9750 filelock.owner = owner | (1ULL << 63);
9751 filelock.pid = fl->l_pid;
9752 filelock.type = lock_cmd;
9753
9754 if (filelock.type == CEPH_LOCK_UNLOCK) {
9755 list<ceph_filelock> activated_locks;
9756 lock_state->remove_lock(filelock, activated_locks);
9757 } else {
9758 bool r = lock_state->add_lock(filelock, false, false, NULL);
9759 assert(r);
9760 }
9761 }
9762
9763 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
9764 {
9765 Inode *in = fh->inode.get();
9766 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
9767 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
9768 return ret;
9769 }
9770
9771 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
9772 {
9773 Inode *in = fh->inode.get();
9774 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
9775 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
9776 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
9777 return ret;
9778 }
9779
9780 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
9781 {
9782 Inode *in = fh->inode.get();
9783 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
9784
9785 int sleep = !(cmd & LOCK_NB);
9786 cmd &= ~LOCK_NB;
9787
9788 int type;
9789 switch (cmd) {
9790 case LOCK_SH:
9791 type = F_RDLCK;
9792 break;
9793 case LOCK_EX:
9794 type = F_WRLCK;
9795 break;
9796 case LOCK_UN:
9797 type = F_UNLCK;
9798 break;
9799 default:
9800 return -EINVAL;
9801 }
9802
9803 struct flock fl;
9804 memset(&fl, 0, sizeof(fl));
9805 fl.l_type = type;
9806 fl.l_whence = SEEK_SET;
9807
9808 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
9809 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
9810 return ret;
9811 }
9812
9813 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
9814 {
9815 /* Since the only thing this does is wrap a call to statfs, and
9816 statfs takes a lock, it doesn't seem we have a need to split it
9817 out. */
9818 return statfs(0, stbuf, perms);
9819 }
9820
9821 void Client::ll_register_callbacks(struct client_callback_args *args)
9822 {
9823 if (!args)
9824 return;
9825 Mutex::Locker l(client_lock);
9826 ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
9827 << " invalidate_ino_cb " << args->ino_cb
9828 << " invalidate_dentry_cb " << args->dentry_cb
9829 << " getgroups_cb" << args->getgroups_cb
9830 << " switch_interrupt_cb " << args->switch_intr_cb
9831 << " remount_cb " << args->remount_cb
9832 << dendl;
9833 callback_handle = args->handle;
9834 if (args->ino_cb) {
9835 ino_invalidate_cb = args->ino_cb;
9836 async_ino_invalidator.start();
9837 }
9838 if (args->dentry_cb) {
9839 dentry_invalidate_cb = args->dentry_cb;
9840 async_dentry_invalidator.start();
9841 }
9842 if (args->switch_intr_cb) {
9843 switch_interrupt_cb = args->switch_intr_cb;
9844 interrupt_finisher.start();
9845 }
9846 if (args->remount_cb) {
9847 remount_cb = args->remount_cb;
9848 remount_finisher.start();
9849 }
9850 getgroups_cb = args->getgroups_cb;
9851 umask_cb = args->umask_cb;
9852 }
9853
// Verify we have some way to keep the kernel dcache consistent: either a
// dentry-invalidation callback (preferred) or a remount callback that
// flushes the whole dcache.  Records the capability in
// can_invalidate_dentries.  Returns 0, or the remount failure code when
// client_die_on_failed_remount is configured.
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    // Trigger a remount now to prove the fallback works.
    int s = remount_cb(callback_handle);
    if (s) {
      lderr(cct) << "Failed to invoke remount, needed to ensure kernel dcache consistency"
		 << dendl;
    }
    if (cct->_conf->client_die_on_failed_remount) {
      // Surface the failure so the mount can be aborted.
      require_remount = true;
      r = s;
    }
  } else {
    // No invalidation mechanism at all: warn loudly, or abort if the
    // configuration demands strict consistency.
    lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    if (cct->_conf->client_die_on_failed_remount)
      ceph_abort();
  }
  return r;
}
9881
// Flush all dirty client state to the cluster: buffered file data (via
// the object cacher), dirty caps, and in-flight unsafe MDS requests.
// Called with client_lock held; temporarily drops it while waiting for
// the data flush to complete.  Always returns 0.
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;  // no object cacher => nothing buffered client-side

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;  // sync point for wait_sync_caps below

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    // Drop client_lock so other client work can progress while we wait on
    // the C_SafeCond completion; reacquire it before returning.
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
9916
9917 int Client::sync_fs()
9918 {
9919 Mutex::Locker l(client_lock);
9920 return _sync_fs();
9921 }
9922
9923 int64_t Client::drop_caches()
9924 {
9925 Mutex::Locker l(client_lock);
9926 return objectcacher->release_all();
9927 }
9928
9929
9930 int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
9931 {
9932 Mutex::Locker l(client_lock);
9933 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
9934 << ", " << offset << ", " << count << ")" << dendl;
9935
9936 Fh *f = get_filehandle(fd);
9937 if (!f)
9938 return -EBADF;
9939
9940 // for now
9941 _fsync(f, true);
9942
9943 return 0;
9944 }
9945
9946 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
9947 {
9948 Mutex::Locker l(client_lock);
9949 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
9950 << ", " << offset << ", " << count << ")" << dendl;
9951
9952 Fh *f = get_filehandle(fd);
9953 if (!f)
9954 return -EBADF;
9955 Inode *in = f->inode.get();
9956
9957 _fsync(f, true);
9958 if (_release(in))
9959 check_caps(in, 0);
9960 return 0;
9961 }
9962
9963
9964 // =============================
9965 // snaps
9966
9967 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
9968 {
9969 Mutex::Locker l(client_lock);
9970 filepath path(relpath);
9971 InodeRef in;
9972 int r = path_walk(path, &in, perm);
9973 if (r < 0)
9974 return r;
9975 if (cct->_conf->client_permissions) {
9976 r = may_create(in.get(), perm);
9977 if (r < 0)
9978 return r;
9979 }
9980 Inode *snapdir = open_snapdir(in.get());
9981 return _mkdir(snapdir, name, 0, perm);
9982 }
9983 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
9984 {
9985 Mutex::Locker l(client_lock);
9986 filepath path(relpath);
9987 InodeRef in;
9988 int r = path_walk(path, &in, perms);
9989 if (r < 0)
9990 return r;
9991 if (cct->_conf->client_permissions) {
9992 r = may_delete(in.get(), NULL, perms);
9993 if (r < 0)
9994 return r;
9995 }
9996 Inode *snapdir = open_snapdir(in.get());
9997 return _rmdir(snapdir, name, perms);
9998 }
9999
10000 // =============================
10001 // expose caps
10002
10003 int Client::get_caps_issued(int fd) {
10004
10005 Mutex::Locker lock(client_lock);
10006
10007 Fh *f = get_filehandle(fd);
10008 if (!f)
10009 return -EBADF;
10010
10011 return f->inode->caps_issued();
10012 }
10013
10014 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10015 {
10016 Mutex::Locker lock(client_lock);
10017 filepath p(path);
10018 InodeRef in;
10019 int r = path_walk(p, &in, perms, true);
10020 if (r < 0)
10021 return r;
10022 return in->caps_issued();
10023 }
10024
10025 // =========================================
10026 // low level
10027
10028 Inode *Client::open_snapdir(Inode *diri)
10029 {
10030 Inode *in;
10031 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10032 if (!inode_map.count(vino)) {
10033 in = new Inode(this, vino, &diri->layout);
10034
10035 in->ino = diri->ino;
10036 in->snapid = CEPH_SNAPDIR;
10037 in->mode = diri->mode;
10038 in->uid = diri->uid;
10039 in->gid = diri->gid;
10040 in->mtime = diri->mtime;
10041 in->ctime = diri->ctime;
10042 in->btime = diri->btime;
10043 in->size = diri->size;
10044 in->change_attr = diri->change_attr;
10045
10046 in->dirfragtree.clear();
10047 in->snapdir_parent = diri;
10048 diri->flags |= I_SNAPDIR_OPEN;
10049 inode_map[vino] = in;
10050 if (use_faked_inos())
10051 _assign_faked_ino(in);
10052 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10053 } else {
10054 in = inode_map[vino];
10055 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10056 }
10057 return in;
10058 }
10059
// Low-level lookup of 'name' under 'parent'.  On success fills *attr,
// takes an ll reference on the resulting inode and returns it via *out;
// on failure attr->st_ino is zeroed and *out is NULL (since 'in' was
// never set).  Returns 0 or a negative errno.
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  int r = 0;
  // Client-side permission check unless FUSE does default_permissions.
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  // Pin the inode for the caller; dropped later via ll_forget/ll_put.
  _ll_get(in.get());

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10096
10097 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10098 struct ceph_statx *stx, unsigned want, unsigned flags,
10099 const UserPerm& perms)
10100 {
10101 Mutex::Locker lock(client_lock);
10102 vinodeno_t vparent = _get_vino(parent);
10103 ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
10104 tout(cct) << "ll_lookupx" << std::endl;
10105 tout(cct) << name << std::endl;
10106
10107 int r = 0;
10108 if (!cct->_conf->fuse_default_permissions) {
10109 r = may_lookup(parent, perms);
10110 if (r < 0)
10111 return r;
10112 }
10113
10114 string dname(name);
10115 InodeRef in;
10116
10117 unsigned mask = statx_to_mask(flags, want);
10118 r = _lookup(parent, dname, mask, &in, perms);
10119 if (r < 0) {
10120 stx->stx_ino = 0;
10121 stx->stx_mask = 0;
10122 } else {
10123 assert(in);
10124 fill_statx(in, mask, stx);
10125 _ll_get(in.get());
10126 }
10127
10128 ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
10129 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10130 tout(cct) << stx->stx_ino << std::endl;
10131 *out = in.get();
10132 return r;
10133 }
10134
10135 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10136 unsigned int want, unsigned int flags, const UserPerm& perms)
10137 {
10138 Mutex::Locker lock(client_lock);
10139 filepath fp(name, 0);
10140 InodeRef in;
10141 int rc;
10142 unsigned mask = statx_to_mask(flags, want);
10143
10144 ldout(cct, 3) << "ll_walk" << name << dendl;
10145 tout(cct) << "ll_walk" << std::endl;
10146 tout(cct) << name << std::endl;
10147
10148 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10149 if (rc < 0) {
10150 /* zero out mask, just in case... */
10151 stx->stx_mask = 0;
10152 stx->stx_ino = 0;
10153 *out = NULL;
10154 return rc;
10155 } else {
10156 assert(in);
10157 fill_statx(in, mask, stx);
10158 _ll_get(in.get());
10159 *out = in.get();
10160 return 0;
10161 }
10162 }
10163
// Take one low-level (FUSE-visible) reference on 'in'.  The first ll
// reference also takes an internal inode reference and, for directories,
// pins the (single) parent dentry so the path stays cached.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10176
// Drop 'num' low-level references from 'in'.  When the count reaches
// zero, undo what the first _ll_get() took: unpin the parent dentry for
// directories and release the inode reference (put_inode() may free the
// inode).  Returns the remaining ll reference count (0 when released).
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10192
10193 void Client::_ll_drop_pins()
10194 {
10195 ldout(cct, 10) << "_ll_drop_pins" << dendl;
10196 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10197 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10198 it != inode_map.end();
10199 it = next) {
10200 Inode *in = it->second;
10201 next = it;
10202 ++next;
10203 if (in->ll_ref)
10204 _ll_put(in, in->ll_ref);
10205 }
10206 }
10207
// Drop 'count' low-level references on 'in' (FUSE forget).  Returns true
// when the last ll reference was dropped.  Forgets on the root inode are
// ignored (always "last").  An over-count is logged and clamped to the
// references actually held.
bool Client::ll_forget(Inode *in, int count)
{
  Mutex::Locker lock(client_lock);
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // The kernel asked us to drop more references than we hold; warn and
    // release everything we have.
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10233
10234 bool Client::ll_put(Inode *in)
10235 {
10236 /* ll_forget already takes the lock */
10237 return ll_forget(in, 1);
10238 }
10239
10240 snapid_t Client::ll_get_snapid(Inode *in)
10241 {
10242 Mutex::Locker lock(client_lock);
10243 return in->snapid;
10244 }
10245
10246 Inode *Client::ll_get_inode(ino_t ino)
10247 {
10248 Mutex::Locker lock(client_lock);
10249 vinodeno_t vino = _map_faked_ino(ino);
10250 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10251 if (p == inode_map.end())
10252 return NULL;
10253 Inode *in = p->second;
10254 _ll_get(in);
10255 return in;
10256 }
10257
10258 Inode *Client::ll_get_inode(vinodeno_t vino)
10259 {
10260 Mutex::Locker lock(client_lock);
10261 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10262 if (p == inode_map.end())
10263 return NULL;
10264 Inode *in = p->second;
10265 _ll_get(in);
10266 return in;
10267 }
10268
10269 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10270 {
10271 vinodeno_t vino = _get_vino(in);
10272
10273 ldout(cct, 3) << "ll_getattr " << vino << dendl;
10274 tout(cct) << "ll_getattr" << std::endl;
10275 tout(cct) << vino.ino.val << std::endl;
10276
10277 if (vino.snapid < CEPH_NOSNAP)
10278 return 0;
10279 else
10280 return _getattr(in, caps, perms);
10281 }
10282
10283 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10284 {
10285 Mutex::Locker lock(client_lock);
10286
10287 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10288
10289 if (res == 0)
10290 fill_stat(in, attr);
10291 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10292 return res;
10293 }
10294
10295 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10296 unsigned int flags, const UserPerm& perms)
10297 {
10298 Mutex::Locker lock(client_lock);
10299
10300 int res = 0;
10301 unsigned mask = statx_to_mask(flags, want);
10302
10303 if (mask && !in->caps_issued_mask(mask))
10304 res = _ll_getattr(in, mask, perms);
10305
10306 if (res == 0)
10307 fill_statx(in, mask, stx);
10308 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10309 return res;
10310 }
10311
// Common setattr path for the low-level API: traces the request, runs
// the client-side permission check (unless FUSE default_permissions is
// in effect), strips the *_NOW convenience flags, and delegates to
// __setattrx.  On success *inp references the updated inode.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << "ll_setattrx" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // Drop the MTIME_NOW/ATIME_NOW shortcut bits before the real setattr —
  // presumably they were already honored by may_setattr/callers; TODO
  // confirm against __setattrx's handling.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
10340
10341 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10342 const UserPerm& perms)
10343 {
10344 Mutex::Locker lock(client_lock);
10345 InodeRef target(in);
10346 int res = _ll_setattrx(in, stx, mask, perms, &target);
10347 if (res == 0) {
10348 assert(in == target.get());
10349 fill_statx(in, in->caps_issued(), stx);
10350 }
10351
10352 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10353 return res;
10354 }
10355
10356 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10357 const UserPerm& perms)
10358 {
10359 struct ceph_statx stx;
10360 stat_to_statx(attr, &stx);
10361
10362 Mutex::Locker lock(client_lock);
10363 InodeRef target(in);
10364 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10365 if (res == 0) {
10366 assert(in == target.get());
10367 fill_stat(in, attr);
10368 }
10369
10370 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10371 return res;
10372 }
10373
10374
10375 // ----------
10376 // xattrs
10377
10378 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10379 const UserPerm& perms)
10380 {
10381 Mutex::Locker lock(client_lock);
10382 InodeRef in;
10383 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10384 if (r < 0)
10385 return r;
10386 return _getxattr(in, name, value, size, perms);
10387 }
10388
10389 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10390 const UserPerm& perms)
10391 {
10392 Mutex::Locker lock(client_lock);
10393 InodeRef in;
10394 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10395 if (r < 0)
10396 return r;
10397 return _getxattr(in, name, value, size, perms);
10398 }
10399
10400 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10401 const UserPerm& perms)
10402 {
10403 Mutex::Locker lock(client_lock);
10404 Fh *f = get_filehandle(fd);
10405 if (!f)
10406 return -EBADF;
10407 return _getxattr(f->inode, name, value, size, perms);
10408 }
10409
10410 int Client::listxattr(const char *path, char *list, size_t size,
10411 const UserPerm& perms)
10412 {
10413 Mutex::Locker lock(client_lock);
10414 InodeRef in;
10415 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10416 if (r < 0)
10417 return r;
10418 return Client::_listxattr(in.get(), list, size, perms);
10419 }
10420
10421 int Client::llistxattr(const char *path, char *list, size_t size,
10422 const UserPerm& perms)
10423 {
10424 Mutex::Locker lock(client_lock);
10425 InodeRef in;
10426 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10427 if (r < 0)
10428 return r;
10429 return Client::_listxattr(in.get(), list, size, perms);
10430 }
10431
10432 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10433 {
10434 Mutex::Locker lock(client_lock);
10435 Fh *f = get_filehandle(fd);
10436 if (!f)
10437 return -EBADF;
10438 return Client::_listxattr(f->inode.get(), list, size, perms);
10439 }
10440
10441 int Client::removexattr(const char *path, const char *name,
10442 const UserPerm& perms)
10443 {
10444 Mutex::Locker lock(client_lock);
10445 InodeRef in;
10446 int r = Client::path_walk(path, &in, perms, true);
10447 if (r < 0)
10448 return r;
10449 return _removexattr(in, name, perms);
10450 }
10451
10452 int Client::lremovexattr(const char *path, const char *name,
10453 const UserPerm& perms)
10454 {
10455 Mutex::Locker lock(client_lock);
10456 InodeRef in;
10457 int r = Client::path_walk(path, &in, perms, false);
10458 if (r < 0)
10459 return r;
10460 return _removexattr(in, name, perms);
10461 }
10462
10463 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10464 {
10465 Mutex::Locker lock(client_lock);
10466 Fh *f = get_filehandle(fd);
10467 if (!f)
10468 return -EBADF;
10469 return _removexattr(f->inode, name, perms);
10470 }
10471
10472 int Client::setxattr(const char *path, const char *name, const void *value,
10473 size_t size, int flags, const UserPerm& perms)
10474 {
10475 _setxattr_maybe_wait_for_osdmap(name, value, size);
10476
10477 Mutex::Locker lock(client_lock);
10478 InodeRef in;
10479 int r = Client::path_walk(path, &in, perms, true);
10480 if (r < 0)
10481 return r;
10482 return _setxattr(in, name, value, size, flags, perms);
10483 }
10484
10485 int Client::lsetxattr(const char *path, const char *name, const void *value,
10486 size_t size, int flags, const UserPerm& perms)
10487 {
10488 _setxattr_maybe_wait_for_osdmap(name, value, size);
10489
10490 Mutex::Locker lock(client_lock);
10491 InodeRef in;
10492 int r = Client::path_walk(path, &in, perms, false);
10493 if (r < 0)
10494 return r;
10495 return _setxattr(in, name, value, size, flags, perms);
10496 }
10497
10498 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10499 int flags, const UserPerm& perms)
10500 {
10501 _setxattr_maybe_wait_for_osdmap(name, value, size);
10502
10503 Mutex::Locker lock(client_lock);
10504 Fh *f = get_filehandle(fd);
10505 if (!f)
10506 return -EBADF;
10507 return _setxattr(f->inode, name, value, size, flags, perms);
10508 }
10509
10510 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10511 const UserPerm& perms)
10512 {
10513 int r;
10514
10515 const VXattr *vxattr = _match_vxattr(in, name);
10516 if (vxattr) {
10517 r = -ENODATA;
10518
10519 // Do a force getattr to get the latest quota before returning
10520 // a value to userspace.
10521 r = _getattr(in, 0, perms, true);
10522 if (r != 0) {
10523 // Error from getattr!
10524 return r;
10525 }
10526
10527 // call pointer-to-member function
10528 char buf[256];
10529 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10530 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10531 } else {
10532 r = -ENODATA;
10533 }
10534
10535 if (size != 0) {
10536 if (r > (int)size) {
10537 r = -ERANGE;
10538 } else if (r > 0) {
10539 memcpy(value, buf, r);
10540 }
10541 }
10542 goto out;
10543 }
10544
10545 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10546 r = -EOPNOTSUPP;
10547 goto out;
10548 }
10549
10550 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10551 if (r == 0) {
10552 string n(name);
10553 r = -ENODATA;
10554 if (in->xattrs.count(n)) {
10555 r = in->xattrs[n].length();
10556 if (r > 0 && size != 0) {
10557 if (size >= (unsigned)r)
10558 memcpy(value, in->xattrs[n].c_str(), r);
10559 else
10560 r = -ERANGE;
10561 }
10562 }
10563 }
10564 out:
10565 ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
10566 return r;
10567 }
10568
10569 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
10570 const UserPerm& perms)
10571 {
10572 if (cct->_conf->client_permissions) {
10573 int r = xattr_permission(in.get(), name, MAY_READ, perms);
10574 if (r < 0)
10575 return r;
10576 }
10577 return _getxattr(in.get(), name, value, size, perms);
10578 }
10579
10580 int Client::ll_getxattr(Inode *in, const char *name, void *value,
10581 size_t size, const UserPerm& perms)
10582 {
10583 Mutex::Locker lock(client_lock);
10584
10585 vinodeno_t vino = _get_vino(in);
10586
10587 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
10588 tout(cct) << "ll_getxattr" << std::endl;
10589 tout(cct) << vino.ino.val << std::endl;
10590 tout(cct) << name << std::endl;
10591
10592 if (!cct->_conf->fuse_default_permissions) {
10593 int r = xattr_permission(in, name, MAY_READ, perms);
10594 if (r < 0)
10595 return r;
10596 }
10597
10598 return _getxattr(in, name, value, size, perms);
10599 }
10600
// List the names of all xattrs on 'in' — regular xattrs plus the visible
// virtual xattrs — as consecutive NUL-terminated strings in 'name'.
// With size == 0 only the required buffer length is computed (listxattr
// probing convention).  Returns the total length, -ERANGE if the buffer
// is too small, or a negative errno from the attribute fetch.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    // First pass: add up the space needed (each name + trailing NUL).
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    if (size != 0) {
      if (size >= (unsigned)r) {
	// Second pass: copy the names into the caller's buffer.
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	if (vxattrs) {
	  // The vxattr table is terminated by an entry with an empty name.
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    // hidden virtual xattrs are not listed
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	r = -ERANGE;
    }
  }
  ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
10645
10646 int Client::ll_listxattr(Inode *in, char *names, size_t size,
10647 const UserPerm& perms)
10648 {
10649 Mutex::Locker lock(client_lock);
10650
10651 vinodeno_t vino = _get_vino(in);
10652
10653 ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
10654 tout(cct) << "ll_listxattr" << std::endl;
10655 tout(cct) << vino.ino.val << std::endl;
10656 tout(cct) << size << std::endl;
10657
10658 return _listxattr(in, names, size, perms);
10659 }
10660
10661 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
10662 size_t size, int flags, const UserPerm& perms)
10663 {
10664
10665 int xattr_flags = 0;
10666 if (!value)
10667 xattr_flags |= CEPH_XATTR_REMOVE;
10668 if (flags & XATTR_CREATE)
10669 xattr_flags |= CEPH_XATTR_CREATE;
10670 if (flags & XATTR_REPLACE)
10671 xattr_flags |= CEPH_XATTR_REPLACE;
10672
10673 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
10674 filepath path;
10675 in->make_nosnap_relative_path(path);
10676 req->set_filepath(path);
10677 req->set_string2(name);
10678 req->set_inode(in);
10679 req->head.args.setxattr.flags = xattr_flags;
10680
10681 bufferlist bl;
10682 bl.append((const char*)value, size);
10683 req->set_data(bl);
10684
10685 int res = make_request(req, perms);
10686
10687 trim_cache();
10688 ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
10689 res << dendl;
10690 return res;
10691 }
10692
/*
 * Set (or, when value == NULL, remove) an xattr on an inode after
 * validating the name and special-casing POSIX ACL xattrs.
 *
 * Returns 0 on success or a negative errno: -EROFS on snapshots,
 * -EOPNOTSUPP for unsupported namespaces or readonly vxattrs, or an
 * error from ACL parsing / the MDS round trip.
 */
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are immutable.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // "system.*" names are only honored when POSIX ACLs are enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Only these namespaces are supported (mirrors the kernel client).
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// If the access ACL reduces to plain mode bits, drop the xattr
	// (value = NULL below) and apply the mode change instead.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // Only stx_mode is read: the CEPH_SETATTR_MODE mask limits
	  // _do_setattr to that field.
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs are only valid on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      // Any other "system.*" name is unsupported.
      return -EOPNOTSUPP;
    }
  } else {
    // Read-only virtual "ceph.*" xattrs cannot be written.
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}
10753
10754 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
10755 size_t size, int flags, const UserPerm& perms)
10756 {
10757 if (cct->_conf->client_permissions) {
10758 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
10759 if (r < 0)
10760 return r;
10761 }
10762 return _setxattr(in.get(), name, value, size, flags, perms);
10763 }
10764
10765 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
10766 {
10767 string tmp;
10768 if (name == "layout") {
10769 string::iterator begin = value.begin();
10770 string::iterator end = value.end();
10771 keys_and_values<string::iterator> p; // create instance of parser
10772 std::map<string, string> m; // map to receive results
10773 if (!qi::parse(begin, end, p, m)) { // returns true if successful
10774 return -EINVAL;
10775 }
10776 if (begin != end)
10777 return -EINVAL;
10778 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
10779 if (q->first == "pool") {
10780 tmp = q->second;
10781 break;
10782 }
10783 }
10784 } else if (name == "layout.pool") {
10785 tmp = value;
10786 }
10787
10788 if (tmp.length()) {
10789 int64_t pool;
10790 try {
10791 pool = boost::lexical_cast<unsigned>(tmp);
10792 if (!osdmap->have_pg_pool(pool))
10793 return -ENOENT;
10794 } catch (boost::bad_lexical_cast const&) {
10795 pool = osdmap->lookup_pg_pool_name(tmp);
10796 if (pool < 0) {
10797 return -ENOENT;
10798 }
10799 }
10800 }
10801
10802 return 0;
10803 }
10804
10805 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
10806 {
10807 // For setting pool of layout, MetaRequest need osdmap epoch.
10808 // There is a race which create a new data pool but client and mds both don't have.
10809 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
10810 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
10811 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
10812 string rest(strstr(name, "layout"));
10813 string v((const char*)value, size);
10814 int r = objecter->with_osdmap([&](const OSDMap& o) {
10815 return _setxattr_check_data_pool(rest, v, &o);
10816 });
10817
10818 if (r == -ENOENT) {
10819 C_SaferCond ctx;
10820 objecter->wait_for_latest_osdmap(&ctx);
10821 ctx.wait();
10822 }
10823 }
10824 }
10825
10826 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
10827 size_t size, int flags, const UserPerm& perms)
10828 {
10829 _setxattr_maybe_wait_for_osdmap(name, value, size);
10830
10831 Mutex::Locker lock(client_lock);
10832
10833 vinodeno_t vino = _get_vino(in);
10834
10835 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
10836 tout(cct) << "ll_setxattr" << std::endl;
10837 tout(cct) << vino.ino.val << std::endl;
10838 tout(cct) << name << std::endl;
10839
10840 if (!cct->_conf->fuse_default_permissions) {
10841 int r = xattr_permission(in, name, MAY_WRITE, perms);
10842 if (r < 0)
10843 return r;
10844 }
10845 return _setxattr(in, name, value, size, flags, perms);
10846 }
10847
10848 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
10849 {
10850 if (in->snapid != CEPH_NOSNAP) {
10851 return -EROFS;
10852 }
10853
10854 // same xattrs supported by kernel client
10855 if (strncmp(name, "user.", 5) &&
10856 strncmp(name, "system.", 7) &&
10857 strncmp(name, "security.", 9) &&
10858 strncmp(name, "trusted.", 8) &&
10859 strncmp(name, "ceph.", 5))
10860 return -EOPNOTSUPP;
10861
10862 const VXattr *vxattr = _match_vxattr(in, name);
10863 if (vxattr && vxattr->readonly)
10864 return -EOPNOTSUPP;
10865
10866 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
10867 filepath path;
10868 in->make_nosnap_relative_path(path);
10869 req->set_filepath(path);
10870 req->set_filepath2(name);
10871 req->set_inode(in);
10872
10873 int res = make_request(req, perms);
10874
10875 trim_cache();
10876 ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
10877 return res;
10878 }
10879
10880 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
10881 {
10882 if (cct->_conf->client_permissions) {
10883 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
10884 if (r < 0)
10885 return r;
10886 }
10887 return _removexattr(in.get(), name, perms);
10888 }
10889
10890 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
10891 {
10892 Mutex::Locker lock(client_lock);
10893
10894 vinodeno_t vino = _get_vino(in);
10895
10896 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
10897 tout(cct) << "ll_removexattr" << std::endl;
10898 tout(cct) << vino.ino.val << std::endl;
10899 tout(cct) << name << std::endl;
10900
10901 if (!cct->_conf->fuse_default_permissions) {
10902 int r = xattr_permission(in, name, MAY_WRITE, perms);
10903 if (r < 0)
10904 return r;
10905 }
10906
10907 return _removexattr(in, name, perms);
10908 }
10909
// --- "ceph.quota" virtual xattr callbacks ---
// Each returns the snprintf-style length of the rendered value.

// The quota vxattrs "exist" only when a quota is set on the inode.
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable();
}
// Combined "max_bytes=... max_files=..." form.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
10929
// --- "ceph.{file,dir}.layout" virtual xattr callbacks ---

// Layout vxattrs "exist" only when the layout differs from the default.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
// Combined layout form; the pool is rendered by name when our osdmap
// knows it, otherwise by numeric id.
// NOTE(review): if 'size' is smaller than the rendered prefix, r can
// exceed size and 'size - r' (size_t) underflows in the later snprintf
// calls — confirm callers always pass a large-enough buffer.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
}
// Pool by name if known to our osdmap, else by numeric id.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r = snprintf(val, size, "%s", o.get_pool_name(
	      in->layout.pool_id).c_str());
      else
	r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// --- "ceph.dir.*" directory statistics virtual xattr callbacks ---
// dirstat covers the immediate children; rstat is recursive over the
// whole subtree.

size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
}
11010 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11011 {
11012 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11013 (long)in->rstat.rctime.nsec());
11014 }
11015
/* Helpers to build "ceph.<type>.<name>[.<name2>]" vxattr names. */
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2)  "ceph." #_type "." #_name "." #_name2

/* Read-only, always-listed vxattr backed by _vxattrcb_<type>_<name>. */
#define XATTR_NAME_CEPH(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  hidden: false,						\
  exists_cb: NULL,						\
}
/* Writable, hidden layout sub-field; listed only via "ceph.<type>.layout". */
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		\
{								\
  name: CEPH_XATTR_NAME2(_type, _name, _field),			\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	\
  readonly: false,						\
  hidden: true,							\
  exists_cb: &Client::_vxattrcb_layout_exists,			\
}
/* Writable, hidden quota sub-field; exists only when a quota is set. */
#define XATTR_QUOTA_FIELD(_type, _name)		                \
{								\
  name: CEPH_XATTR_NAME(_type, _name),			        \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: false,						\
  hidden: true,							\
  exists_cb: &Client::_vxattrcb_quota_exists,			\
}

/*
 * Virtual xattrs exposed on directories.  Terminated by an entry with an
 * empty name (see _match_vxattr / _listxattr iteration).
 */
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  XATTR_NAME_CEPH(dir, rentries),
  XATTR_NAME_CEPH(dir, rfiles),
  XATTR_NAME_CEPH(dir, rsubdirs),
  XATTR_NAME_CEPH(dir, rbytes),
  XATTR_NAME_CEPH(dir, rctime),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  { name: "" }     /* Required table terminator */
};
11076
/*
 * Virtual xattrs exposed on regular files.  Terminated by an entry with
 * an empty name (see _match_vxattr / _listxattr iteration).
 */
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" }     /* Required table terminator */
};
11092
11093 const Client::VXattr *Client::_get_vxattrs(Inode *in)
11094 {
11095 if (in->is_dir())
11096 return _dir_vxattrs;
11097 else if (in->is_file())
11098 return _file_vxattrs;
11099 return NULL;
11100 }
11101
11102 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11103 {
11104 if (strncmp(name, "ceph.", 5) == 0) {
11105 const VXattr *vxattr = _get_vxattrs(in);
11106 if (vxattr) {
11107 while (!vxattr->name.empty()) {
11108 if (vxattr->name == name)
11109 return vxattr;
11110 vxattr++;
11111 }
11112 }
11113 }
11114 return NULL;
11115 }
11116
11117 size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11118 {
11119 size_t len = 0;
11120 while (!vxattr->name.empty()) {
11121 if (!vxattr->hidden)
11122 len += vxattr->name.length() + 1;
11123 vxattr++;
11124 }
11125 return len;
11126 }
11127
11128 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11129 {
11130 Mutex::Locker lock(client_lock);
11131
11132 vinodeno_t vino = _get_vino(in);
11133
11134 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11135 tout(cct) << "ll_readlink" << std::endl;
11136 tout(cct) << vino.ino.val << std::endl;
11137
11138 set<Dentry*>::iterator dn = in->dn_set.begin();
11139 while (dn != in->dn_set.end()) {
11140 touch_dn(*dn);
11141 ++dn;
11142 }
11143
11144 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11145 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11146 return r;
11147 }
11148
/*
 * Create a filesystem node named 'name' in directory 'dir' via an MDS
 * MKNOD request (mknod(2) semantics).
 *
 * mode/rdev are the mknod(2) arguments; 'inp' receives the new inode.
 * Returns 0 or a negative errno (-ENAMETOOLONG, -EROFS on snapshots,
 * -EDQUOT when the file quota is exceeded, or an MDS error).
 */
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Apply an inherited default ACL, if any; this may also adjust 'mode',
  // so the mode is stored in the request afterwards.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted to the MDS; drop our reference.
  put_request(req);
  return res;
}
11202
11203 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
11204 dev_t rdev, struct stat *attr, Inode **out,
11205 const UserPerm& perms)
11206 {
11207 Mutex::Locker lock(client_lock);
11208
11209 vinodeno_t vparent = _get_vino(parent);
11210
11211 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
11212 tout(cct) << "ll_mknod" << std::endl;
11213 tout(cct) << vparent.ino.val << std::endl;
11214 tout(cct) << name << std::endl;
11215 tout(cct) << mode << std::endl;
11216 tout(cct) << rdev << std::endl;
11217
11218 if (!cct->_conf->fuse_default_permissions) {
11219 int r = may_create(parent, perms);
11220 if (r < 0)
11221 return r;
11222 }
11223
11224 InodeRef in;
11225 int r = _mknod(parent, name, mode, rdev, perms, &in);
11226 if (r == 0) {
11227 fill_stat(in, attr);
11228 _ll_get(in.get());
11229 }
11230 tout(cct) << attr->st_ino << std::endl;
11231 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
11232 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11233 *out = in.get();
11234 return r;
11235 }
11236
11237 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
11238 dev_t rdev, Inode **out,
11239 struct ceph_statx *stx, unsigned want, unsigned flags,
11240 const UserPerm& perms)
11241 {
11242 unsigned caps = statx_to_mask(flags, want);
11243 Mutex::Locker lock(client_lock);
11244
11245 vinodeno_t vparent = _get_vino(parent);
11246
11247 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
11248 tout(cct) << "ll_mknodx" << std::endl;
11249 tout(cct) << vparent.ino.val << std::endl;
11250 tout(cct) << name << std::endl;
11251 tout(cct) << mode << std::endl;
11252 tout(cct) << rdev << std::endl;
11253
11254 if (!cct->_conf->fuse_default_permissions) {
11255 int r = may_create(parent, perms);
11256 if (r < 0)
11257 return r;
11258 }
11259
11260 InodeRef in;
11261 int r = _mknod(parent, name, mode, rdev, perms, &in);
11262 if (r == 0) {
11263 fill_statx(in, caps, stx);
11264 _ll_get(in.get());
11265 }
11266 tout(cct) << stx->stx_ino << std::endl;
11267 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
11268 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11269 *out = in.get();
11270 return r;
11271 }
11272
/*
 * Create (and optionally open) a regular file in 'dir'.
 *
 * flags/mode     open(2)-style flags and permission bits (S_IFREG added)
 * inp            receives the new inode
 * fhp            if non-NULL, an open Fh for the file is created here
 * stripe_unit, stripe_count, object_size, data_pool
 *                optional file layout overrides (0/NULL = defaults)
 * created        passed to make_request; reports whether the MDS created it
 *
 * Returns 0 or a negative errno.
 */
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  // Resolve the optional data pool name to an id; the wire field for the
  // pool is 32 bits, hence the -ERANGE check.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // Apply an inherited default ACL, if any; may also adjust 'mode'.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted to the MDS; drop our reference.
  put_request(req);
  return res;
}
11365
11366
/*
 * Create a directory named 'name' in 'dir' via an MDS MKDIR request.
 * When 'dir' is the snapdir (CEPH_SNAPDIR), this instead creates a
 * snapshot (MKSNAP).  'inp' receives the new inode.
 *
 * Returns 0 or a negative errno (-ENAMETOOLONG, -EROFS, -EDQUOT, or an
 * MDS error).
 */
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Writable only on the live tree or on the snapdir (for MKSNAP).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // Apply an inherited default ACL, if any; may also adjust 'mode'.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted to the MDS; drop our reference.
  put_request(req);
  return res;
}
11422
11423 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
11424 struct stat *attr, Inode **out, const UserPerm& perm)
11425 {
11426 Mutex::Locker lock(client_lock);
11427
11428 vinodeno_t vparent = _get_vino(parent);
11429
11430 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
11431 tout(cct) << "ll_mkdir" << std::endl;
11432 tout(cct) << vparent.ino.val << std::endl;
11433 tout(cct) << name << std::endl;
11434 tout(cct) << mode << std::endl;
11435
11436 if (!cct->_conf->fuse_default_permissions) {
11437 int r = may_create(parent, perm);
11438 if (r < 0)
11439 return r;
11440 }
11441
11442 InodeRef in;
11443 int r = _mkdir(parent, name, mode, perm, &in);
11444 if (r == 0) {
11445 fill_stat(in, attr);
11446 _ll_get(in.get());
11447 }
11448 tout(cct) << attr->st_ino << std::endl;
11449 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
11450 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11451 *out = in.get();
11452 return r;
11453 }
11454
11455 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
11456 struct ceph_statx *stx, unsigned want, unsigned flags,
11457 const UserPerm& perms)
11458 {
11459 Mutex::Locker lock(client_lock);
11460
11461 vinodeno_t vparent = _get_vino(parent);
11462
11463 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
11464 tout(cct) << "ll_mkdirx" << std::endl;
11465 tout(cct) << vparent.ino.val << std::endl;
11466 tout(cct) << name << std::endl;
11467 tout(cct) << mode << std::endl;
11468
11469 if (!cct->_conf->fuse_default_permissions) {
11470 int r = may_create(parent, perms);
11471 if (r < 0)
11472 return r;
11473 }
11474
11475 InodeRef in;
11476 int r = _mkdir(parent, name, mode, perms, &in);
11477 if (r == 0) {
11478 fill_statx(in, statx_to_mask(flags, want), stx);
11479 _ll_get(in.get());
11480 } else {
11481 stx->stx_ino = 0;
11482 stx->stx_mask = 0;
11483 }
11484 tout(cct) << stx->stx_ino << std::endl;
11485 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
11486 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11487 *out = in.get();
11488 return r;
11489 }
11490
/*
 * Create a symlink named 'name' in 'dir' pointing at 'target' via an
 * MDS SYMLINK request.  'inp' receives the new inode.
 *
 * Returns 0 or a negative errno (-ENAMETOOLONG, -EROFS on snapshots,
 * -EDQUOT, or an MDS error).
 */
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // The link target travels in the request's string2 field.
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // Request was never submitted to the MDS; drop our reference.
  put_request(req);
  return res;
}
11536
11537 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
11538 struct stat *attr, Inode **out, const UserPerm& perms)
11539 {
11540 Mutex::Locker lock(client_lock);
11541
11542 vinodeno_t vparent = _get_vino(parent);
11543
11544 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
11545 << dendl;
11546 tout(cct) << "ll_symlink" << std::endl;
11547 tout(cct) << vparent.ino.val << std::endl;
11548 tout(cct) << name << std::endl;
11549 tout(cct) << value << std::endl;
11550
11551 if (!cct->_conf->fuse_default_permissions) {
11552 int r = may_create(parent, perms);
11553 if (r < 0)
11554 return r;
11555 }
11556
11557 InodeRef in;
11558 int r = _symlink(parent, name, value, perms, &in);
11559 if (r == 0) {
11560 fill_stat(in, attr);
11561 _ll_get(in.get());
11562 }
11563 tout(cct) << attr->st_ino << std::endl;
11564 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
11565 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11566 *out = in.get();
11567 return r;
11568 }
11569
11570 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
11571 Inode **out, struct ceph_statx *stx, unsigned want,
11572 unsigned flags, const UserPerm& perms)
11573 {
11574 Mutex::Locker lock(client_lock);
11575
11576 vinodeno_t vparent = _get_vino(parent);
11577
11578 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
11579 << dendl;
11580 tout(cct) << "ll_symlinkx" << std::endl;
11581 tout(cct) << vparent.ino.val << std::endl;
11582 tout(cct) << name << std::endl;
11583 tout(cct) << value << std::endl;
11584
11585 if (!cct->_conf->fuse_default_permissions) {
11586 int r = may_create(parent, perms);
11587 if (r < 0)
11588 return r;
11589 }
11590
11591 InodeRef in;
11592 int r = _symlink(parent, name, value, perms, &in);
11593 if (r == 0) {
11594 fill_statx(in, statx_to_mask(flags, want), stx);
11595 _ll_get(in.get());
11596 }
11597 tout(cct) << stx->stx_ino << std::endl;
11598 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
11599 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11600 *out = in.get();
11601 return r;
11602 }
11603
/*
 * Unlink 'name' from directory 'dir' via an MDS UNLINK request.
 * Returns 0 or a negative errno (-EROFS on snapshots, or a lookup/MDS
 * error).
 */
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 3) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Look up the target inode so its link caps can be dropped along with
  // the request.
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;
  req->set_other_inode(otherin.get());
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted to the MDS; drop our reference.
  put_request(req);
  return res;
}
11649
11650 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
11651 {
11652 Mutex::Locker lock(client_lock);
11653
11654 vinodeno_t vino = _get_vino(in);
11655
11656 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
11657 tout(cct) << "ll_unlink" << std::endl;
11658 tout(cct) << vino.ino.val << std::endl;
11659 tout(cct) << name << std::endl;
11660
11661 if (!cct->_conf->fuse_default_permissions) {
11662 int r = may_delete(in, name, perm);
11663 if (r < 0)
11664 return r;
11665 }
11666 return _unlink(in, name, perm);
11667 }
11668
// Remove the directory (or snapshot, when `dir` is the .snap dir) named
// `name` under `dir`. Caller must hold client_lock.
// Returns 0 on success or a negative errno.
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // Writable targets are live dirs and the special snapdir (for rmsnap).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  // Removing an entry of the snapdir deletes a snapshot (RMSNAP);
  // otherwise this is a plain RMDIR.
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP:CEPH_MDS_OP_RMDIR);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  // Offer cap drops with the request to save the MDS a revoke round trip.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;
  if (req->get_op() == CEPH_MDS_OP_RMDIR) {
    req->set_inode(dir);
    req->set_dentry(de);
    req->set_other_inode(in.get());
  } else {
    // RMSNAP replies carry no trace dentry, so invalidate ours up front.
    unlink(de, true, true);
    req->set_other_inode(in.get());
  }

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted; release our ref so it is freed.
  put_request(req);
  return res;
}
11716
11717 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
11718 {
11719 Mutex::Locker lock(client_lock);
11720
11721 vinodeno_t vino = _get_vino(in);
11722
11723 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
11724 tout(cct) << "ll_rmdir" << std::endl;
11725 tout(cct) << vino.ino.val << std::endl;
11726 tout(cct) << name << std::endl;
11727
11728 if (!cct->_conf->fuse_default_permissions) {
11729 int r = may_delete(in, name, perms);
11730 if (r < 0)
11731 return r;
11732 }
11733
11734 return _rmdir(in, name, perms);
11735 }
11736
// Rename fromdir/fromname to todir/toname via an MDS RENAME (or RENAMESNAP
// when renaming a snapshot inside the snapdir). Caller must hold client_lock.
// Returns 0 on success or a negative errno (-EXDEV across snapshots or
// quota roots, -EROFS inside a snapshot).
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  // Cannot rename between different snapshot contexts.
  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // Within the same snapdir a rename means "rename the snapshot";
    // anything else in a snapshot is read-only.
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  if (fromdir != todir) {
    // Renames may not cross quota roots (quota accounting can't move).
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  // filepath = destination, filepath2 = source (MDS rename convention).
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;
    req->set_old_inode(oldin.get());
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination may or may not exist; -ENOENT is fine (plain rename),
    // an existing target gets its link caps dropped (it will be replaced).
    res = _lookup(todir, toname, 0, &otherin, perm);
    if (res != 0 && res != -ENOENT) {
      goto fail;
    } else if (res == 0) {
      req->set_other_inode(otherin.get());
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted; release our ref so it is freed.
  put_request(req);
  return res;
}
11830
// Low-level (libcephfs/FUSE) entry point for rename: traces the call,
// optionally enforces permissions client-side, then delegates to _rename().
int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
		      const char *newname, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
		<< vnewparent << " " << newname << dendl;
  tout(cct) << "ll_rename" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << vnewparent.ino.val << std::endl;
  tout(cct) << newname << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(parent, name, perm);
    if (r < 0)
      return r;
    // A missing destination is fine (plain rename); any other failure
    // means we may not replace it.
    r = may_delete(newparent, newname, perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }

  return _rename(parent, name, newparent, newname, perm);
}
11858
// Create hard link dir/newname pointing at existing inode `in` via an MDS
// LINK request. On success *inp receives the (possibly updated) inode.
// Caller must hold client_lock. Returns 0 or a negative errno.
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Both the source inode and the target directory must be live (no snaps).
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  // filepath = new link location, filepath2 = existing inode.
  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // Request was never submitted; release our ref so it is freed.
  put_request(req);
  return res;
}
11902
// Low-level (libcephfs/FUSE) entry point for link: traces the call,
// optionally enforces permissions client-side, then delegates to _link().
int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
		    const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
    newname << dendl;
  tout(cct) << "ll_link" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << vnewparent << std::endl;
  tout(cct) << newname << std::endl;

  int r = 0;
  InodeRef target;

  if (!cct->_conf->fuse_default_permissions) {
    // POSIX forbids hard links to directories.
    if (S_ISDIR(in->mode))
      return -EPERM;

    r = may_hardlink(in, perm);
    if (r < 0)
      return r;

    r = may_create(newparent, perm);
    if (r < 0)
      return r;
  }

  return _link(in, newparent, newname, perm, &target);
}
11936
11937 int Client::ll_num_osds(void)
11938 {
11939 Mutex::Locker lock(client_lock);
11940 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
11941 }
11942
// Look up the IPv4 address of OSD `osd` in the current osdmap and store it
// (host byte order) in *addr. Returns 0 on success, -1 if the OSD does not
// exist. NOTE(review): assumes the OSD address is IPv4 (in4_addr) — an
// IPv6-only OSD would not be representable in a uint32_t.
int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  Mutex::Locker lock(client_lock);
  entity_addr_t g;
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return false;
      g = o.get_addr(osd);
      return true;
    });
  if (!exists)
    return -1;
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
  *addr = ntohl(nb_addr);  // convert network byte order to host order
  return 0;
}
// Return the stripe unit (bytes) of the inode's file layout.
uint32_t Client::ll_stripe_unit(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->layout.stripe_unit;
}
11964
// Return the snapshot sequence number of the inode's snap realm.
// NOTE(review): dereferences in->snaprealm unconditionally — assumes the
// caller only passes inodes with an attached realm; verify against callers.
uint64_t Client::ll_snap_seq(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->snaprealm->seq;
}
11970
// Copy the inode's file layout into *layout. Always succeeds (returns 0).
int Client::ll_file_layout(Inode *in, file_layout_t *layout)
{
  Mutex::Locker lock(client_lock);
  *layout = in->layout;
  return 0;
}
11977
// Fh-based convenience overload: forwards to the Inode variant.
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
11982
11983 /* Currently we cannot take advantage of redundancy in reads, since we
11984 would have to go through all possible placement groups (a
11985 potentially quite large number determined by a hash), and use CRUSH
11986 to calculate the appropriate set of OSDs for each placement group,
11987 then index into that. An array with one entry per OSD is much more
11988 tractable and works for demonstration purposes. */
11989
// Map logical block `blockno` of the file to the primary OSD that serves it,
// using the supplied layout: block -> (stripe, object set) -> object -> PG
// -> acting primary. Returns the primary OSD id.
// NOTE(review): assumes layout->stripe_unit and layout->stripe_count are
// non-zero (divisions below) — presumably guaranteed by layout validation.
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);
  inodeno_t ino = ll_get_inodeno(in);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;

  uint64_t stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
  uint64_t stripepos = blockno % stripe_count;   // which object in the object set (X)
  uint64_t objectsetno = stripeno / stripes_per_object;       // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
	o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
12016
12017 /* Return the offset of the block, internal to the object */
12018
/* Return the offset of the block, internal to the object */
// i.e. the byte offset of logical block `blockno` within the RADOS object
// that holds it, given the inode's striping layout.
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  Mutex::Locker lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  // NOTE(review): assumes su != 0 — presumably guaranteed by layout validation.
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
12029
// Low-level opendir: optional client-side permission check, then _opendir().
// On success *dirpp receives the new directory handle.
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  tout(cct) << (unsigned long)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
		<< dendl;
  return r;
}
12054
// Close a directory handle obtained from ll_opendir. Always returns 0.
int Client::ll_releasedir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;
  _closedir(dirp);
  return 0;
}
12064
// fsync on a directory handle: flush the directory inode's metadata
// (syncdataonly=false — directories have no file data).
int Client::ll_fsyncdir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  return _fsync(dirp->inode.get(), false);
}
12074
// Low-level open of an existing inode (creation goes through ll_create*;
// O_CREAT is asserted away). On success a new Fh is stored via fhp (if
// non-NULL) and tracked in ll_unclosed_fh_set for leak detection.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  assert(!(flags & O_CREAT));

  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!cct->_conf->fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // Record the handle (if any) even on the error path so it is not leaked.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
12107
// Shared implementation for ll_create/ll_createx: look the name up, create
// it if absent and O_CREAT was given, then open it. On success *in holds the
// resulting inode and *fhp the open handle (opened here if _create did not
// already return one). Caller must hold client_lock.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL demands the name not already exist.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Pre-existing file: the create path did not check open permission,
    // so do it now, and open a handle if _create didn't produce one.
    if (!cct->_conf->fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

 out:
  // Track any open handle (even on error paths) for leak detection.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
12189
// Public ll_create (struct stat flavor): create/open via _ll_create and fill
// *attr. When outp is non-NULL the returned Inode carries an extra ll ref
// that the caller must put.
int Client::ll_create(Inode *parent, const char *name, mode_t mode,
		      int flags, struct stat *attr, Inode **outp, Fh **fhp,
		      const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  InodeRef in;

  int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
		     fhp, perms);
  if (r >= 0) {
    assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_stat(in, attr);
  } else {
    attr->st_ino = 0;  // signal "no inode" to the caller on failure
  }

  return r;
}
12214
// Public ll_create (ceph_statx flavor): like ll_create but fills a statx
// with only the caps implied by (lflags, want). When outp is non-NULL the
// returned Inode carries an extra ll ref the caller must put.
int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
		       int oflags, Inode **outp, Fh **fhp,
		       struct ceph_statx *stx, unsigned want, unsigned lflags,
		       const UserPerm& perms)
{
  unsigned caps = statx_to_mask(lflags, want);
  Mutex::Locker lock(client_lock);
  InodeRef in;

  int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
  if (r >= 0) {
    assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_statx(in, caps, stx);
  } else {
    // Signal "no inode" on failure.
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }

  return r;
}
12242
// Low-level lseek: trace the call and delegate to _lseek.
loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "ll_lseek" << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  return _lseek(fh, offset, whence);
}
12252
12253 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12254 {
12255 Mutex::Locker lock(client_lock);
12256 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12257 tout(cct) << "ll_read" << std::endl;
12258 tout(cct) << (unsigned long)fh << std::endl;
12259 tout(cct) << off << std::endl;
12260 tout(cct) << len << std::endl;
12261
12262 return _read(fh, off, len, bl);
12263 }
12264
// Read up to `length` bytes at `offset` from the RADOS object holding the
// file's block `blockid`, synchronously, copying into `buf`. Returns bytes
// read or a negative errno. client_lock is dropped while waiting on the OSD.
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vino = ll_get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop the client lock while blocking on the OSD reply.
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    // NOTE(review): assumes buf has room for bl.length() (<= length) bytes.
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
12297
12298 /* It appears that the OSD doesn't return success unless the entire
12299 buffer was written, return the write length on success. */
12300
// Synchronously write `length` bytes from buf to the RADOS object holding
// the file's block `blockid`, with snapshot seq `snapseq`. Returns `length`
// on success (the OSD only acks complete writes) or a negative errno.
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  Mutex flock("Client::ll_write_block flock");
  vinodeno_t vino = ll_get_vino(in);
  Cond cond;
  bool done;
  int r = 0;
  Context *onsafe;

  if (length == 0) {
    return -EINVAL;
  }
  // NOTE(review): the "true ||" forces the stable-write path for every call;
  // the unstable/barrier branch below is intentionally dead (its body is
  // commented out) pending a barrier implementation.
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe = new C_SafeCond(&flock, &cond, &done, &r);
    done = false;
  } else {
    /* if write is unstable, we just place a barrier for
     * future commits to wait on */
    /*onsafe = new C_Block_Sync(this, vino.ino,
			      barrier_interval(offset, offset + length), &r);
    */
    done = true;
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  bufferptr bp;
  if (length > 0) bp = buffer::copy(buf, length);
  bufferlist bl;
  bl.push_back(bp);

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe);

  client_lock.Unlock();
  // Wait for the OSD commit; C_SafeCond signals `cond` and sets `done`/`r`.
  if (!done /* also !sync */) {
    flock.Lock();
    while (! done)
      cond.Wait(flock);
    flock.Unlock();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
12368
// Commit previously barriered block writes in [offset, offset+length).
// The barrier machinery is currently disabled (body commented out), so this
// is a no-op that always returns 0.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  Mutex::Locker lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = ll_get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
12394
// Low-level write: trace the call and delegate to _write. Returns bytes
// written or a negative errno.
int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
    "~" << len << dendl;
  tout(cct) << "ll_write" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  int r = _write(fh, off, len, data, NULL, 0);
  ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
		<< dendl;
  return r;
}
12410
// Low-level flush (close-time flush of buffered data for this handle).
int Client::ll_flush(Fh *fh)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  return _flush(fh);
}
12420
// Low-level fsync; when syncdataonly is true only file data (not metadata)
// is flushed. On failure the handle's pending async error is consumed so it
// is reported exactly once.
int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH
    fh->take_async_err();
  }
  return r;
}
12435
#ifdef FALLOC_FL_PUNCH_HOLE

// fallocate(2) implementation. Supported modes: 0 (allocate/extend),
// FALLOC_FL_KEEP_SIZE, and FALLOC_FL_PUNCH_HOLE (which must be combined
// with KEEP_SIZE, as on Linux). Caller must hold client_lock; it is
// dropped while waiting for OSD zeroing / inline-data uninlining.
// Returns 0 or a negative errno.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // Linux semantics: PUNCH_HOLE requires KEEP_SIZE.
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // A full pool can still accept hole punching (it frees space).
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Extending allocations count against the byte quota.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  // Completion plumbing for converting inline data to regular objects.
  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and we hold the buffer cap: punch the hole locally by
      // rebuilding the inline blob with the hole zero-filled.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);      // prefix before hole
        int size = length;
        if (offset + size > len)
          size = len - offset;                      // clamp hole to EOF
        if (size > 0)
          bl.append_zero(size);                     // the hole itself
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl); // suffix
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
    } else {
      // Otherwise uninline first (if needed), then zero the range on the OSDs.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline = new C_SafeCond(&uninline_flock,
                                    &uninline_cond,
                                    &uninline_done,
                                    &uninline_ret);
        uninline_data(in, onuninline);
      }

      Mutex flock("Client::_punch_hole flock");
      Cond cond;
      bool done = false;
      Context *onfinish = new C_SafeCond(&flock, &cond, &done);

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, onfinish);
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      // Block (with client_lock dropped) until the zero completes.
      client_lock.Unlock();
      flock.Lock();
      while (!done)
        cond.Wait(flock);
      flock.Unlock();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocate beyond EOF: just extend the size; space is allocated
    // lazily on write.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
	check_caps(in, 0);
      }
    }
  }

  // If we started an uninline, wait for it (client_lock dropped) and fold
  // its result in; -ECANCELED means someone else already uninlined.
  if (onuninline) {
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

// Platform without FALLOC_FL_PUNCH_HOLE: fallocate is unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
12580
12581
// Low-level fallocate: trace the call and delegate to _fallocate.
int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  return _fallocate(fh, mode, offset, length);
}
12591
12592 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
12593 {
12594 Mutex::Locker lock(client_lock);
12595 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
12596
12597 Fh *fh = get_filehandle(fd);
12598 if (!fh)
12599 return -EBADF;
12600 #if defined(__linux__) && defined(O_PATH)
12601 if (fh->flags & O_PATH)
12602 return -EBADF;
12603 #endif
12604 return _fallocate(fh, mode, offset, length);
12605 }
12606
// Release a low-level file handle: remove it from the unclosed-handle
// tracking set and close it.
int Client::ll_release(Fh *fh)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
    dendl;
  tout(cct) << "ll_release (fh)" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (ll_unclosed_fh_set.count(fh))
    ll_unclosed_fh_set.erase(fh);
  return _release_fh(fh);
}
12619
// POSIX advisory lock query (F_GETLK) on a low-level handle.
int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;

  return _getlk(fh, fl, owner);
}
12629
// POSIX advisory lock set (F_SETLK/F_SETLKW when sleep != 0) on a
// low-level handle.
int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;

  return _setlk(fh, fl, owner, sleep);
}
12639
// BSD flock(2)-style whole-file lock on a low-level handle.
int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;

  return _flock(fh, cmd, owner);
}
12649
// Finisher context that interrupts an in-flight SETFILELOCK MetaRequest.
// Holds a ref on the request from construction until finish() runs, so the
// request cannot be freed while the interrupt is queued.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // pin the request for the lifetime of this context
  }
  void finish(int r) override {
    Mutex::Locker l(client->client_lock);
    // Only file-lock requests are interruptible this way.
    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the constructor
  }
};
12665
// FUSE interrupt callback: queue an interrupt for the given MetaRequest on
// the finisher thread (we may not take client_lock in this context).
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
  tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
12673
12674 // =========================================
12675 // layout
12676
12677 // expose file layouts
12678
// Resolve `relpath` and copy the resulting inode's file layout into *lp.
// Returns 0 on success or a negative errno from path resolution.
int Client::describe_layout(const char *relpath, file_layout_t *lp,
			    const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
  return 0;
}
12695
// fd-based variant of describe_layout: copy the open file's layout into *lp.
// Returns 0 on success or -EBADF for an invalid fd.
int Client::fdescribe_layout(int fd, file_layout_t *lp)
{
  Mutex::Locker lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  *lp = in->layout;

  ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
  return 0;
}
12710
12711
12712 // expose osdmap
12713
// Look up a pool id by name in the current osdmap (negative on failure).
int64_t Client::get_pool_id(const char *pool_name)
{
  Mutex::Locker lock(client_lock);
  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
			       pool_name);
}
12720
// Look up a pool's name by id; returns the empty string if the pool
// does not exist.
string Client::get_pool_name(int64_t pool)
{
  Mutex::Locker lock(client_lock);
  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
    });
}
12728
// Return the replication size of a pool, or -ENOENT if it does not exist.
int Client::get_pool_replication(int64_t pool)
{
  Mutex::Locker lock(client_lock);
  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
    });
}
12736
// Find the acting OSD set for the object backing file offset `off` of fd,
// and optionally the number of bytes (*len) remaining in that stripe unit.
// Returns 0 on success, -EBADF for a bad fd, -EINVAL if no OSDs are acting.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  Mutex::Locker lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map a 1-byte span at `off` to exactly one object extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
12779
12780 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
12781 {
12782 Mutex::Locker lock(client_lock);
12783 if (id < 0)
12784 return -EINVAL;
12785 return objecter->with_osdmap([&](const OSDMap& o) {
12786 return o.crush->get_full_location_ordered(id, path);
12787 });
12788 }
12789
12790 int Client::get_file_stripe_address(int fd, loff_t offset,
12791 vector<entity_addr_t>& address)
12792 {
12793 Mutex::Locker lock(client_lock);
12794
12795 Fh *f = get_filehandle(fd);
12796 if (!f)
12797 return -EBADF;
12798 Inode *in = f->inode.get();
12799
12800 // which object?
12801 vector<ObjectExtent> extents;
12802 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
12803 in->truncate_size, extents);
12804 assert(extents.size() == 1);
12805
12806 // now we have the object and its 'layout'
12807 return objecter->with_osdmap([&](const OSDMap& o) {
12808 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
12809 vector<int> osds;
12810 o.pg_to_acting_osds(pg, osds);
12811 if (osds.empty())
12812 return -EINVAL;
12813 for (unsigned i = 0; i < osds.size(); i++) {
12814 entity_addr_t addr = o.get_addr(osds[i]);
12815 address.push_back(addr);
12816 }
12817 return 0;
12818 });
12819 }
12820
12821 int Client::get_osd_addr(int osd, entity_addr_t& addr)
12822 {
12823 Mutex::Locker lock(client_lock);
12824 return objecter->with_osdmap([&](const OSDMap& o) {
12825 if (!o.exists(osd))
12826 return -ENOENT;
12827
12828 addr = o.get_addr(osd);
12829 return 0;
12830 });
12831 }
12832
12833 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
12834 loff_t length, loff_t offset)
12835 {
12836 Mutex::Locker lock(client_lock);
12837
12838 Fh *f = get_filehandle(fd);
12839 if (!f)
12840 return -EBADF;
12841 Inode *in = f->inode.get();
12842
12843 // map to a list of extents
12844 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
12845
12846 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
12847 return 0;
12848 }
12849
12850
12851 /*
12852 * find an osd with the same ip. -1 if none.
12853 */
12854 int Client::get_local_osd()
12855 {
12856 Mutex::Locker lock(client_lock);
12857 objecter->with_osdmap([this](const OSDMap& o) {
12858 if (o.get_epoch() != local_osd_epoch) {
12859 local_osd = o.find_osd_on_ip(messenger->get_myaddr());
12860 local_osd_epoch = o.get_epoch();
12861 }
12862 });
12863 return local_osd;
12864 }
12865
12866
12867
12868
12869
12870
12871 // ===============================
12872
// Messenger callback: a connection we initiated was established.
// Nothing to do beyond logging.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
}
12877
// Messenger callback: the session on 'con' was reset.  Log only;
// returning false tells the messenger we did not handle the reset.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
  return false;
}
12883
// Messenger callback: the peer on 'con' dropped its connection state
// ("remote reset").  For MDS peers, react according to the state of
// our session with that MDS; other peer types are ignored.
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
  Mutex::Locker l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p) {
	if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
	  mds = p->first;
	  s = p->second;
	}
      }
      if (mds >= 0) {
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    // carry any open-waiters over to a fresh session attempt
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    // behavior is configurable: either drop the session so it
	    // gets reconnected, or just mark it stale
	    const md_config_t *conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  // nothing to do for sessions that never opened / already closed
	  break;
	}
      }
    }
    break;
  }
}
12943
// Messenger callback: the peer refused our connection attempt.
// Log only; returning false leaves the default handling in place.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
  return false;
}
12949
// Build an authorizer for outgoing connections to peers of 'dest_type'.
// Monitors handle their own authentication, so nothing is built for
// them.  Always returns true; 'force_new' is unused here.
bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
{
  if (dest_type == CEPH_ENTITY_TYPE_MON)
    return true;
  *authorizer = monclient->build_authorizer(dest_type);
  return true;
}
12957
// Find the nearest ancestor of 'in' (excluding 'in' itself) that has a
// quota enabled, falling back to root_ancestor when none is found.
//
// Parent links are taken from cached dentries when their lease, or the
// parent directory's shared cap generation, is still valid.  Otherwise
// the parent is fetched from the MDS via CEPH_MDS_OP_LOOKUPNAME and the
// walk restarts from 'in' so the freshly cached parent is re-examined.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    // stop at the first ancestor (not the starting inode) with a quota
    if (cur != in && cur->quota.is_enable())
      break;

    // try to find a trustworthy cached parent for 'cur'
    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
	Dentry *dn = *p;
	if (dn->lease_mds >= 0 &&
	    dn->lease_ttl > now &&
	    mds_sessions.count(dn->lease_mds)) {
	  // dentry lease still valid on a live MDS session
	  parent_in = dn->dir->parent_inode;
	} else {
	  // no lease; accept the dentry if the parent dir's shared cap
	  // generation still matches the dentry's
	  Inode *diri = dn->dir->parent_inode;
	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	      diri->shared_gen == dn->cap_shared_gen) {
	    parent_in = dn->dir->parent_inode;
	  }
	}
	if (parent_in)
	  break;
      }
    } else if (root_parents.count(cur)) {
      // ancestor of the mount root, tracked separately in root_parents
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    // already at the top; no quota-bearing ancestor was found
    if (cur == root_ancestor)
      break;

    // no usable cached parent: ask the MDS who cur's parent is
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
		    << " failed to find parent of " << cur->vino()
		    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    // refresh 'now' so lease checks use the post-request time
    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
13023
13024 /**
13025 * Traverse quota ancestors of the Inode, return true
13026 * if any of them passes the passed function
13027 */
13028 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13029 std::function<bool (const Inode &in)> test)
13030 {
13031 while (true) {
13032 assert(in != NULL);
13033 if (test(*in)) {
13034 return true;
13035 }
13036
13037 if (in == root_ancestor) {
13038 // We're done traversing, drop out
13039 return false;
13040 } else {
13041 // Continue up the tree
13042 in = get_quota_root(in, perms);
13043 }
13044 }
13045
13046 return false;
13047 }
13048
13049 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13050 {
13051 return check_quota_condition(in, perms,
13052 [](const Inode &in) {
13053 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13054 });
13055 }
13056
13057 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
13058 const UserPerm& perms)
13059 {
13060 return check_quota_condition(in, perms,
13061 [&new_bytes](const Inode &in) {
13062 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13063 > in.quota.max_bytes;
13064 });
13065 }
13066
13067 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
13068 {
13069 return check_quota_condition(in, perms,
13070 [](const Inode &in) {
13071 if (in.quota.max_bytes) {
13072 if (in.rstat.rbytes >= in.quota.max_bytes) {
13073 return true;
13074 }
13075
13076 assert(in.size >= in.reported_size);
13077 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
13078 const uint64_t size = in.size - in.reported_size;
13079 return (space >> 4) < size;
13080 } else {
13081 return false;
13082 }
13083 });
13084 }
13085
// State/result bits for the per-pool permission cache (pool_perms),
// used by check_pool_perm() below.
enum {
  POOL_CHECKED = 1,   // a permission check has completed for this pool
  POOL_CHECKING = 2,  // a check is in flight; other callers wait
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
13092
// Verify this client may read and/or write the pool used by 'in' (per
// its layout), as requested by 'need' (CEPH_CAP_FILE_RD / _WR bits).
//
// Results are cached per (pool id, pool namespace) in pool_perms.  The
// first caller for a pool marks it POOL_CHECKING and probes the pool
// with one read op and one write-create op against the file's first
// object; concurrent callers block on waiting_for_pool_perm until the
// probe finishes.  client_lock must be held on entry; it is dropped
// while waiting for the probe ops to complete.
//
// Returns 0 if the needed access is allowed, -EPERM if not, or -EIO if
// the probe failed with an unexpected error.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // claim the probe so concurrent callers wait instead of re-probing
    pool_perms[perm_key] = POOL_CHECKING;

    // probe target: the file's first object
    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // read probe: a plain stat of the object
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // write probe: exclusive create (-EEXIST still proves write access)
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // drop the client lock while both probes are in flight
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    // -ENOENT means the object is absent but the read was permitted
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // record the verdict and wake any waiters
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13195
13196 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13197 {
13198 if (acl_type == POSIX_ACL) {
13199 if (in->xattrs.count(ACL_EA_ACCESS)) {
13200 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13201
13202 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13203 }
13204 }
13205 return -EAGAIN;
13206 }
13207
13208 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
13209 {
13210 if (acl_type == NO_ACL)
13211 return 0;
13212
13213 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
13214 if (r < 0)
13215 goto out;
13216
13217 if (acl_type == POSIX_ACL) {
13218 if (in->xattrs.count(ACL_EA_ACCESS)) {
13219 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13220 bufferptr acl(access_acl.c_str(), access_acl.length());
13221 r = posix_acl_access_chmod(acl, mode);
13222 if (r < 0)
13223 goto out;
13224 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
13225 } else {
13226 r = 0;
13227 }
13228 }
13229 out:
13230 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
13231 return r;
13232 }
13233
// Apply the parent directory's default POSIX ACL (if any) to a newly
// created inode: adjust *mode accordingly and encode any inherited ACL
// xattrs into 'xattrs_bl' for the create request.  Symlinks never
// carry ACLs; with no default ACL, the umask callback is applied to
// *mode instead.
//
// Returns the number of xattrs encoded (0 if none), or a negative
// error code.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  if (S_ISLNK(*mode))
    return 0;

  // make sure the parent's cached xattrs are populated
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      // fold the default ACL into *mode; r > 0 means the new inode
      // needs an access ACL of its own
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// r > 0 here means the ACL is not representable by mode bits
	// alone, so store it as an xattr
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // directories also inherit the default ACL itself
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // no default ACL: just honor the umask callback, if any
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
13281
13282 void Client::set_filer_flags(int flags)
13283 {
13284 Mutex::Locker l(client_lock);
13285 assert(flags == 0 ||
13286 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13287 objecter->add_global_op_flags(flags);
13288 }
13289
// Clear a global objecter op flag previously set by set_filer_flags().
// Only CEPH_OSD_FLAG_LOCALIZE_READS may be cleared.
void Client::clear_filer_flags(int flags)
{
  Mutex::Locker l(client_lock);
  assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->clear_global_op_flag(flags);
}
13296
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 *
 * @param e the OSD map epoch the MDS must reach before acting on
 *          our cap releases
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
13309
13310 const char** Client::get_tracked_conf_keys() const
13311 {
13312 static const char* keys[] = {
13313 "client_cache_size",
13314 "client_cache_mid",
13315 "client_acl_type",
13316 NULL
13317 };
13318 return keys;
13319 }
13320
13321 void Client::handle_conf_change(const struct md_config_t *conf,
13322 const std::set <std::string> &changed)
13323 {
13324 Mutex::Locker lock(client_lock);
13325
13326 if (changed.count("client_cache_size") ||
13327 changed.count("client_cache_mid")) {
13328 lru.lru_set_max(cct->_conf->client_cache_size);
13329 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
13330 }
13331 if (changed.count("client_acl_type")) {
13332 acl_type = NO_ACL;
13333 if (cct->_conf->client_acl_type == "posix_acl")
13334 acl_type = POSIX_ACL;
13335 }
13336 }
13337
13338 void Client::init_groups(UserPerm *perms)
13339 {
13340 gid_t *sgids;
13341 int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
13342 perms->init_gids(sgids, count);
13343 }
13344
// boost::intrusive_ptr support: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
13349
// boost::intrusive_ptr support: drop a reference, letting the owning
// client release the inode when the count reaches zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
13354
13355 mds_rank_t Client::_get_random_up_mds() const
13356 {
13357 assert(client_lock.is_locked_by_me());
13358
13359 std::set<mds_rank_t> up;
13360 mdsmap->get_up_mds_set(up);
13361
13362 if (up.empty())
13363 return MDS_RANK_NONE;
13364 std::set<mds_rank_t>::const_iterator p = up.begin();
13365 for (int n = rand() % up.size(); n; n--)
13366 ++p;
13367 return *p;
13368 }
13369
13370
// A Client that constructs and owns its own Objecter (rather than
// sharing one provided by an embedding application).
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
13377
StandaloneClient::~StandaloneClient()
{
  // we created the objecter in our constructor, so we destroy it here
  delete objecter;
  objecter = nullptr;
}
13383
// Bring up the standalone client: start timer, object cacher and
// objecter, register dispatchers, then initialize the mon client.
// If the mon client fails, everything started so far is torn down
// again.  Returns 0 on success or the monclient error code.
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  // complete generic Client initialization (outside client_lock)
  _finish_init();

  return 0;
}
13414
// Tear down in reverse order of init(): generic client shutdown first,
// then the objecter we own, then the mon client.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}
13421