]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
update source to 12.2.11
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <sys/stat.h>
22 #include <sys/param.h>
23 #include <fcntl.h>
24 #include <sys/file.h>
25 #include <sys/utsname.h>
26 #include <sys/uio.h>
27
28 #include <boost/lexical_cast.hpp>
29 #include <boost/fusion/include/std_pair.hpp>
30
31 #if defined(__FreeBSD__)
32 #define XATTR_CREATE 0x1
33 #define XATTR_REPLACE 0x2
34 #else
35 #include <sys/xattr.h>
36 #endif
37
38 #if defined(__linux__)
39 #include <linux/falloc.h>
40 #endif
41
42 #include <sys/statvfs.h>
43
44 #include "common/config.h"
45 #include "common/version.h"
46
47 // ceph stuff
48 #include "messages/MClientSession.h"
49 #include "messages/MClientReconnect.h"
50 #include "messages/MClientRequest.h"
51 #include "messages/MClientRequestForward.h"
52 #include "messages/MClientReply.h"
53 #include "messages/MClientCaps.h"
54 #include "messages/MClientLease.h"
55 #include "messages/MClientSnap.h"
56 #include "messages/MCommandReply.h"
57 #include "messages/MOSDMap.h"
58 #include "messages/MClientQuota.h"
59 #include "messages/MClientCapRelease.h"
60 #include "messages/MMDSMap.h"
61 #include "messages/MFSMap.h"
62 #include "messages/MFSMapUser.h"
63
64 #include "mon/MonClient.h"
65
66 #include "mds/flock.h"
67 #include "osd/OSDMap.h"
68 #include "osdc/Filer.h"
69
70 #include "common/Cond.h"
71 #include "common/Mutex.h"
72 #include "common/perf_counters.h"
73 #include "common/admin_socket.h"
74 #include "common/errno.h"
75 #include "include/str_list.h"
76
77 #define dout_subsys ceph_subsys_client
78
79 #include "include/lru.h"
80 #include "include/compat.h"
81 #include "include/stringify.h"
82
83 #include "Client.h"
84 #include "Inode.h"
85 #include "Dentry.h"
86 #include "Delegation.h"
87 #include "Dir.h"
88 #include "ClientSnapRealm.h"
89 #include "Fh.h"
90 #include "MetaSession.h"
91 #include "MetaRequest.h"
92 #include "ObjecterWriteback.h"
93 #include "posix_acl.h"
94
95 #include "include/assert.h"
96 #include "include/stat.h"
97
98 #include "include/cephfs/ceph_statx.h"
99
100 #if HAVE_GETGROUPLIST
101 #include <grp.h>
102 #include <pwd.h>
103 #include <unistd.h>
104 #endif
105
106 #undef dout_prefix
107 #define dout_prefix *_dout << "client." << whoami << " "
108
109 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
110
111 // FreeBSD fails to define this
112 #ifndef O_DSYNC
113 #define O_DSYNC 0x0
114 #endif
115 // Darwin fails to define this
116 #ifndef O_RSYNC
117 #define O_RSYNC 0x0
118 #endif
119
120 #ifndef O_DIRECT
121 #define O_DIRECT 0x0
122 #endif
123
124 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
125
126 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
127 {
128 Client *client = static_cast<Client*>(p);
129 client->flush_set_callback(oset);
130 }
131
132
133 // -------------
134
135 Client::CommandHook::CommandHook(Client *client) :
136 m_client(client)
137 {
138 }
139
140 bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
141 std::string format, bufferlist& out)
142 {
143 Formatter *f = Formatter::create(format);
144 f->open_object_section("result");
145 m_client->client_lock.Lock();
146 if (command == "mds_requests")
147 m_client->dump_mds_requests(f);
148 else if (command == "mds_sessions")
149 m_client->dump_mds_sessions(f);
150 else if (command == "dump_cache")
151 m_client->dump_cache(f);
152 else if (command == "kick_stale_sessions")
153 m_client->_kick_stale_sessions();
154 else if (command == "status")
155 m_client->dump_status(f);
156 else
157 assert(0 == "bad command registered");
158 m_client->client_lock.Unlock();
159 f->close_section();
160 f->flush(out);
161 delete f;
162 return true;
163 }
164
165
166 // -------------
167
// Fresh per-open directory read state.  Offsets 0 and 1 are reserved for
// "." and "..", so the first real entry is handed out at next_offset 2;
// the release/ordered counters and shared-gen snapshot start at zero and
// are captured later when a readdir actually begins.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
173
174 void Client::_reset_faked_inos()
175 {
176 ino_t start = 1024;
177 free_faked_inos.clear();
178 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
179 last_used_faked_ino = 0;
180 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
181 }
182
// Allocate the next free faked inode number for @in, scanning the free
// interval set circularly starting just past the last number handed out.
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // Ran off the top of the number space: wrap around and rescan.
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // The pool is assumed non-empty (32-bit space, reclaimed on release).
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // The next free interval lies above the cursor; take its first value.
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // Cursor is inside this free interval; take the next value in it.
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // Record the allocation both ways: out of the free pool, into the map.
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
202
203 void Client::_release_faked_ino(Inode *in)
204 {
205 free_faked_inos.insert(in->faked_ino);
206 faked_ino_map.erase(in->faked_ino);
207 }
208
209 vinodeno_t Client::_map_faked_ino(ino_t ino)
210 {
211 vinodeno_t vino;
212 if (ino == 1)
213 vino = root->vino();
214 else if (faked_ino_map.count(ino))
215 vino = faked_ino_map[ino];
216 else
217 vino = vinodeno_t(0, CEPH_NOSNAP);
218 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
219 return vino;
220 }
221
222 vinodeno_t Client::map_faked_ino(ino_t ino)
223 {
224 Mutex::Locker lock(client_lock);
225 return _map_faked_ino(ino);
226 }
227
228 // cons/des
229
// Construct the client around an existing Messenger/MonClient/Objecter.
// NOTE: the member-initializer list must stay in member declaration
// order; several members (timer, finishers) capture m->cct before the
// Dispatcher base has fully published cct.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-ENXIO), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock"),
    deleg_timeout(0)
{
  // Start with an empty faked-ino allocator and no root inode; the root
  // is populated on mount by add_update_inode().
  _reset_faked_inos();
  //
  root = 0;

  num_flushing_caps = 0;

  // Precompute virtual-xattr name sizes so listxattr can size buffers
  // without walking the tables every call.
  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are reserved (stdio etc. in some callers)
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces: the object cacher shares client_lock with us, so its
  // callbacks run under the same lock discipline as the rest of Client.
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback, // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
297
298
299 Client::~Client()
300 {
301 assert(!client_lock.is_locked());
302
303 // It is necessary to hold client_lock, because any inode destruction
304 // may call into ObjectCacher, which asserts that it's lock (which is
305 // client_lock) is held.
306 client_lock.Lock();
307 tear_down_cache();
308 client_lock.Unlock();
309 }
310
311 void Client::tear_down_cache()
312 {
313 // fd's
314 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
315 it != fd_map.end();
316 ++it) {
317 Fh *fh = it->second;
318 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
319 _release_fh(fh);
320 }
321 fd_map.clear();
322
323 while (!opened_dirs.empty()) {
324 dir_result_t *dirp = *opened_dirs.begin();
325 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
326 _closedir(dirp);
327 }
328
329 // caps!
330 // *** FIXME ***
331
332 // empty lru
333 trim_cache();
334 assert(lru.lru_get_size() == 0);
335
336 // close root ino
337 assert(inode_map.size() <= 1 + root_parents.size());
338 if (root && inode_map.size() == 1 + root_parents.size()) {
339 delete root;
340 root = 0;
341 root_ancestor = 0;
342 while (!root_parents.empty())
343 root_parents.erase(root_parents.begin());
344 inode_map.clear();
345 _reset_faked_inos();
346 }
347
348 assert(inode_map.empty());
349 }
350
351 inodeno_t Client::get_root_ino()
352 {
353 Mutex::Locker l(client_lock);
354 if (use_faked_inos())
355 return root->faked_ino;
356 else
357 return root->ino;
358 }
359
360 Inode *Client::get_root()
361 {
362 Mutex::Locker l(client_lock);
363 root->ll_get();
364 return root;
365 }
366
367
368 // debug crapola
369
370 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
371 {
372 filepath path;
373 in->make_long_path(path);
374 ldout(cct, 1) << "dump_inode: "
375 << (disconnected ? "DISCONNECTED ":"")
376 << "inode " << in->ino
377 << " " << path
378 << " ref " << in->get_num_ref()
379 << *in << dendl;
380
381 if (f) {
382 f->open_object_section("inode");
383 f->dump_stream("path") << path;
384 if (disconnected)
385 f->dump_int("disconnected", 1);
386 in->dump(f);
387 f->close_section();
388 }
389
390 did.insert(in);
391 if (in->dir) {
392 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
393 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
394 it != in->dir->dentries.end();
395 ++it) {
396 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
397 if (f) {
398 f->open_object_section("dentry");
399 it->second->dump(f);
400 f->close_section();
401 }
402 if (it->second->inode)
403 dump_inode(f, it->second->inode.get(), did, false);
404 }
405 }
406 }
407
408 void Client::dump_cache(Formatter *f)
409 {
410 set<Inode*> did;
411
412 ldout(cct, 1) << "dump_cache" << dendl;
413
414 if (f)
415 f->open_array_section("cache");
416
417 if (root)
418 dump_inode(f, root, did, true);
419
420 // make a second pass to catch anything disconnected
421 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
422 it != inode_map.end();
423 ++it) {
424 if (did.count(it->second))
425 continue;
426 dump_inode(f, it->second, did, true);
427 }
428
429 if (f)
430 f->close_section();
431 }
432
// Dump a one-shot status summary (identity, cache sizes, map epochs) to
// @f.  Caller must hold client_lock; the osdmap epoch is read through
// the objecter's own locking.
void Client::dump_status(Formatter *f)
{
  assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
    f->dump_object("inst", inst);
    // inst is emitted three ways for different consumers: structured,
    // stringified whole, and address only.
    f->dump_stream("inst_str") << inst;
    f->dump_stream("addr_str") << inst.addr;
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}
462
// First-stage initialization: start the timer and object cacher, then
// register as a dispatcher.  Always returns 0; the remaining setup
// (perf counters, admin socket commands) happens in _finish_init().
int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  assert(!initialized);

  // From here on, incoming messages may be dispatched to this object.
  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}
477
478 void Client::_finish_init()
479 {
480 client_lock.Lock();
481 // logger
482 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
483 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
484 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
485 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
486 logger.reset(plb.create_perf_counters());
487 cct->get_perfcounters_collection()->add(logger.get());
488
489 client_lock.Unlock();
490
491 cct->_conf->add_observer(this);
492
493 AdminSocket* admin_socket = cct->get_admin_socket();
494 int ret = admin_socket->register_command("mds_requests",
495 "mds_requests",
496 &m_command_hook,
497 "show in-progress mds requests");
498 if (ret < 0) {
499 lderr(cct) << "error registering admin socket command: "
500 << cpp_strerror(-ret) << dendl;
501 }
502 ret = admin_socket->register_command("mds_sessions",
503 "mds_sessions",
504 &m_command_hook,
505 "show mds session state");
506 if (ret < 0) {
507 lderr(cct) << "error registering admin socket command: "
508 << cpp_strerror(-ret) << dendl;
509 }
510 ret = admin_socket->register_command("dump_cache",
511 "dump_cache",
512 &m_command_hook,
513 "show in-memory metadata cache contents");
514 if (ret < 0) {
515 lderr(cct) << "error registering admin socket command: "
516 << cpp_strerror(-ret) << dendl;
517 }
518 ret = admin_socket->register_command("kick_stale_sessions",
519 "kick_stale_sessions",
520 &m_command_hook,
521 "kick sessions that were remote reset");
522 if (ret < 0) {
523 lderr(cct) << "error registering admin socket command: "
524 << cpp_strerror(-ret) << dendl;
525 }
526 ret = admin_socket->register_command("status",
527 "status",
528 &m_command_hook,
529 "show overall client status");
530 if (ret < 0) {
531 lderr(cct) << "error registering admin socket command: "
532 << cpp_strerror(-ret) << dendl;
533 }
534
535 client_lock.Lock();
536 initialized = true;
537 client_lock.Unlock();
538 }
539
// Orderly teardown, the reverse of init()/_finish_init().  The ordering
// matters: sessions close first, admin hooks unregister before their
// targets die, each finisher drains before stopping, and the object
// cacher is stopped OUTSIDE client_lock because it joins its thread.
void Client::shutdown()
{
  ldout(cct, 1) << "shutdown" << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf->remove_observer(this);

  // Unregister admin-socket commands before tearing down the state they
  // would dump.
  AdminSocket* admin_socket = cct->get_admin_socket();
  admin_socket->unregister_command("mds_requests");
  admin_socket->unregister_command("mds_sessions");
  admin_socket->unregister_command("dump_cache");
  admin_socket->unregister_command("kick_stale_sessions");
  admin_socket->unregister_command("status");

  // Drain and stop each callback finisher that was started (a non-null
  // callback pointer is the marker that its finisher is running).
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  // Remove the perf counters last; logging above may still touch them.
  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
599
600
601 // ===================
602 // metadata cache stuff
603
604 void Client::trim_cache(bool trim_kernel_dcache)
605 {
606 uint64_t max = cct->_conf->client_cache_size;
607 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
608 unsigned last = 0;
609 while (lru.lru_get_size() != last) {
610 last = lru.lru_get_size();
611
612 if (!unmounting && lru.lru_get_size() <= max) break;
613
614 // trim!
615 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
616 if (!dn)
617 break; // done
618
619 trim_dentry(dn);
620 }
621
622 if (trim_kernel_dcache && lru.lru_get_size() > max)
623 _invalidate_kernel_dcache();
624
625 // hose root?
626 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
627 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
628 delete root;
629 root = 0;
630 root_ancestor = 0;
631 while (!root_parents.empty())
632 root_parents.erase(root_parents.begin());
633 inode_map.clear();
634 _reset_faked_inos();
635 }
636 }
637
638 void Client::trim_cache_for_reconnect(MetaSession *s)
639 {
640 mds_rank_t mds = s->mds_num;
641 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
642
643 int trimmed = 0;
644 list<Dentry*> skipped;
645 while (lru.lru_get_size() > 0) {
646 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
647 if (!dn)
648 break;
649
650 if ((dn->inode && dn->inode->caps.count(mds)) ||
651 dn->dir->parent_inode->caps.count(mds)) {
652 trim_dentry(dn);
653 trimmed++;
654 } else
655 skipped.push_back(dn);
656 }
657
658 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
659 lru.lru_insert_mid(*p);
660
661 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
662 << " trimmed " << trimmed << " dentries" << dendl;
663
664 if (s->caps.size() > 0)
665 _invalidate_kernel_dcache();
666 }
667
668 void Client::trim_dentry(Dentry *dn)
669 {
670 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
671 << " in dir " << hex << dn->dir->parent_inode->ino
672 << dendl;
673 if (dn->inode) {
674 Inode *diri = dn->dir->parent_inode;
675 diri->dir_release_count++;
676 clear_dir_complete_and_ordered(diri, true);
677 }
678 unlink(dn, false, false); // drop dir, drop dentry
679 }
680
681
// Apply an MDS-reported size/truncation update to @in.  truncate_seq
// ordering decides whether the MDS value wins over what we have locally;
// @issued is the cap set currently held (unused here but kept for parity
// with update_inode_file_time's signature).
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  // Take the MDS size if it comes from a newer truncation epoch, or the
  // same epoch with a larger size (writes extend within an epoch).
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // A truncate_size change can arrive with an equal (not just newer)
  // truncate_seq; only files carry a meaningful truncate_size.
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
723
// Merge MDS-reported ctime/mtime/atime into @in.  Which side wins
// depends on the caps we hold (exclusive caps mean our local times may
// be newer than the MDS's) and on time_warp_seq ordering.
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
				    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
		 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
		   << " is higher than local time_warp_seq "
		   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    // Holding write-ish caps: our local times are authoritative-ish, so
    // merge conservatively rather than overwrite.
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    // No relevant caps: the MDS values win outright, as long as they are
    // not from an older time-warp epoch.
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // The MDS sent an older time_warp_seq than we hold without us having
    // FILE_EXCL to justify it — flag the inconsistency loudly.
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
775
776 void Client::_fragmap_remove_non_leaves(Inode *in)
777 {
778 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
779 if (!in->dirfragtree.is_leaf(p->first))
780 in->fragmap.erase(p++);
781 else
782 ++p;
783 }
784
785 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
786 {
787 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
788 if (p->second == mds)
789 in->fragmap.erase(p++);
790 else
791 ++p;
792 }
793
// Create or update the cached inode for an MDS-supplied InodeStat.
// Field-by-field, a value from @st is applied only when it cannot be
// stale relative to local state: either the stat is strictly newer
// (version/auth check) or it grants caps we did not already hold, and
// we do not hold an EXCL cap that makes our local copy authoritative.
// Finally installs/updates the cap the MDS granted along with the stat.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // First inode ever seen becomes the root.
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // Pre-mount traces walk upward; track the chain of root parents.
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // Caps we effectively hold now: issued plus locally dirtied ones.
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  // AUTH fields (mode/owner): skip if we hold AUTH_EXCL (ours is newer).
  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning indoes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
		   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
		   request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // Snapshot inodes have no live cap machinery; just accumulate caps.
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
945
946
947 /*
948 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
949 */
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the
 * metadata cache.  An existing dentry with the wrong target is unlinked
 * first; @old_dentry (a rename source) is unlinked from its old dir.
 * Any directory whose ordered listing is disturbed gets its
 * ordered-count bumped and completeness flags cleared.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // Already linked to the right inode: just refresh its LRU position.
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // tmp_ref keeps @in alive across the unlink below, which could
    // otherwise drop its last reference.
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
995
996 void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
997 {
998 utime_t dttl = from;
999 dttl += (float)dlease->duration_ms / 1000.0;
1000
1001 assert(dn);
1002
1003 if (dlease->mask & CEPH_LOCK_DN) {
1004 if (dttl > dn->lease_ttl) {
1005 ldout(cct, 10) << "got dentry lease on " << dn->name
1006 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1007 dn->lease_ttl = dttl;
1008 dn->lease_mds = session->mds_num;
1009 dn->lease_seq = dlease->seq;
1010 dn->lease_gen = session->cap_gen;
1011 }
1012 }
1013 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1014 }
1015
1016
1017 /*
1018 * update MDS location cache for a single inode
1019 */
/*
 * update MDS location cache for a single inode: record (or forget) which
 * mds is authoritative for the reported dirfrag, force the fragtree to
 * treat that frag as a leaf, and note whether the frag is replicated.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // Negative auth means "no known auth mds"; drop any stale entry.
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1051
1052 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1053 {
1054 if (diri->flags & I_COMPLETE) {
1055 if (complete) {
1056 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1057 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1058 } else {
1059 if (diri->flags & I_DIR_ORDERED) {
1060 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1061 diri->flags &= ~I_DIR_ORDERED;
1062 }
1063 }
1064 if (diri->dir)
1065 diri->dir->readdir_cache.clear();
1066 }
1067 }
1068
1069 /*
1070 * insert results from readdir or lssnap into the metadata cache.
1071 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  MClientReply *reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();

  dir_result_t *dirp = request->dirp;
  assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  bufferlist::iterator p = reply->get_extra_bl().begin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    assert(dir);

    // dirstat
    DirStat dst(p);
    __u32 numdn;
    __u16 flags;
    ::decode(numdn, p);
    ::decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real dentry slot; an empty last_name means we
    // are at the start of the fragment
    assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may answer from a different fragment than the one we asked
    // for (e.g. after a split/merge); adopt the fragment it replied with
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    // if we are starting a listing from the very beginning of the
    // directory, snapshot the release/ordered/shared-gen counters; they
    // let us detect below whether the cached listing stayed valid
    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      ::decode(dname, p);
      ::decode(dlease, p);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        // each distinct hash value starts its own offset sequence at 2
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache (only while the counters we snapshotted at
      // the start of the listing still match, i.e. nothing changed the
      // directory underneath us)
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          assert(0 == "unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1224
1225 /** insert_trace
1226 *
1227 * insert a trace from a MDS reply into the cache.
1228 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // the unsafe (early) reply already updated the cache; the safe reply
    // carries no trace and there is nothing more to apply
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we cannot update the cache from the reply, so
    // invalidate what we know about the affected directory/dentries
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // a dentry trace carries: parent dir inode stat, dir stat, name, lease
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug option: verify the MDS actually returned the xattrs we
      // asked for via the getattr/open mask
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // negative dentry: drop any stale inode link, then (if the MDS
      // granted a lease) keep a null dentry to cache the negative result
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1394
1395 // -------
1396
// Pick the MDS rank a request should be sent to.  Preference order:
// explicit resend target, dirfrag hash placement, held caps, random.
// On a hash-based choice, *phash_diri is set to the directory inode whose
// fragmap was consulted (so the caller can repair it if the mds is gone).
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  // a forward/resend target overrides everything else (one-shot)
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: hash the name within its parent directory
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // snapped inodes have no caps of their own; walk up to the nearest
      // non-snap ancestor and target its mds instead
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dn_set.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    // if we know which mds owns the fragment this name hashes into,
    // send there directly
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
        goto out;
      }
    }

    // otherwise prefer the mds we hold caps from (auth cap if the op
    // must go to the authoritative mds)
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1500
1501
1502 void Client::connect_mds_targets(mds_rank_t mds)
1503 {
1504 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1505 assert(mds_sessions.count(mds));
1506 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1507 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1508 q != info.export_targets.end();
1509 ++q) {
1510 if (mds_sessions.count(*q) == 0 &&
1511 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1512 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1513 << " export target mds." << *q << dendl;
1514 _open_mds_session(*q);
1515 }
1516 }
1517 }
1518
1519 void Client::dump_mds_sessions(Formatter *f)
1520 {
1521 f->dump_int("id", get_nodeid().v);
1522 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
1523 f->dump_object("inst", inst);
1524 f->dump_stream("inst_str") << inst;
1525 f->dump_stream("addr_str") << inst.addr;
1526 f->open_array_section("sessions");
1527 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1528 f->open_object_section("session");
1529 p->second->dump(f);
1530 f->close_section();
1531 }
1532 f->close_section();
1533 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1534 }
1535 void Client::dump_mds_requests(Formatter *f)
1536 {
1537 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1538 p != mds_requests.end();
1539 ++p) {
1540 f->open_object_section("request");
1541 p->second->dump(f);
1542 f->close_section();
1543 }
1544 }
1545
// After a reply has been processed, make sure the caller gets a usable
// target inode even when the MDS sent a traceless reply: fall back to a
// lookup/getattr by name and detect a lost create race (-EINTR).
int Client::verify_reply_trace(int r,
                               MetaRequest *request, MClientReply *reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // the trace already pinned the target for us
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          assert(0 == "how did this happen? i want logs!");
        }
      } else {
        // no dentry on the request; re-stat the inode it operated on
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1618
1619
1620 /**
1621 * make a request
1622 *
1623 * Blocking helper to make an MDS request.
1624 *
1625 * If the ptarget flag is set, behavior changes slightly: the caller
1626 * expects to get a pointer to the inode we are creating or operating
1627 * on. As a result, we will follow up any traceless mutation reply
1628 * with a getattr or lookup to transparently handle a traceless reply
1629 * from the MDS (as when the MDS restarts and the client has to replay
1630 * a request).
1631 *
1632 * @param request the MetaRequest to execute
1633 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1634 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1635 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1636 * @param use_mds [optional] prefer a specific mds (-1 for default)
1637 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1638 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // setfilelock requests are excluded from the oldest-tid accounting
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: choose an mds, ensure a session, send, and wait; loop
  // again on forward/kick until we get a reply or the request aborts
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        // the chosen rank no longer exists; fix up how we chose it
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
           request->resend_mds < 0 && // forward
           !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // we only get here via abort; clean up our registration/reference
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request); // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  // if the caller wants the target inode, resolve it (handles the
  // traceless-reply case via lookup/getattr)
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
1786
1787 void Client::unregister_request(MetaRequest *req)
1788 {
1789 mds_requests.erase(req->tid);
1790 if (req->tid == oldest_tid) {
1791 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1792 while (true) {
1793 if (p == mds_requests.end()) {
1794 oldest_tid = 0;
1795 break;
1796 }
1797 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1798 oldest_tid = p->first;
1799 break;
1800 }
1801 ++p;
1802 }
1803 }
1804 put_request(req);
1805 }
1806
1807 void Client::put_request(MetaRequest *request)
1808 {
1809 if (request->_put()) {
1810 int op = -1;
1811 if (request->success)
1812 op = request->get_op();
1813 InodeRef other_in;
1814 request->take_other_inode(&other_in);
1815 delete request;
1816
1817 if (other_in &&
1818 (op == CEPH_MDS_OP_RMDIR ||
1819 op == CEPH_MDS_OP_RENAME ||
1820 op == CEPH_MDS_OP_RMSNAP)) {
1821 _try_to_trim_inode(other_in.get(), false);
1822 }
1823 }
1824 }
1825
1826 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1827 mds_rank_t mds, int drop,
1828 int unless, int force)
1829 {
1830 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1831 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1832 << ", have:" << ", force:" << force << ")" << dendl;
1833 int released = 0;
1834 if (in->caps.count(mds)) {
1835 Cap *caps = in->caps[mds];
1836 drop &= ~(in->dirty_caps | get_caps_used(in));
1837 if ((drop & caps->issued) &&
1838 !(unless & caps->issued)) {
1839 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1840 caps->issued &= ~drop;
1841 caps->implemented &= ~drop;
1842 released = 1;
1843 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1844 } else {
1845 released = force;
1846 }
1847 if (released) {
1848 ceph_mds_request_release rel;
1849 rel.ino = in->ino;
1850 rel.cap_id = caps->cap_id;
1851 rel.seq = caps->seq;
1852 rel.issue_seq = caps->issue_seq;
1853 rel.mseq = caps->mseq;
1854 rel.caps = caps->implemented;
1855 rel.wanted = caps->wanted;
1856 rel.dname_len = 0;
1857 rel.dname_seq = 0;
1858 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1859 }
1860 }
1861 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1862 << released << dendl;
1863 return released;
1864 }
1865
// Append a release for the lease on 'dn' (and caps on its parent dir) to
// the request, so the MDS need not revoke them itself.
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
                                   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << "encode_dentry_release enter(dn:"
                 << dn << ")" << dendl;
  int released = 0;
  // releasing a dentry goes through its parent directory's inode;
  // force=1 so a release record is appended even if no caps were dropped
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
                                    mds, drop, unless, 1);
  // if we also hold a lease on this dentry from the same mds, attach the
  // dentry name and lease seq to the record we just appended
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    MClientRequest::Release& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << "encode_dentry_release exit(dn:"
                 << dn << ")" << dendl;
}
1885
1886
1887 /*
1888 * This requires the MClientRequest *request member to be set.
1889 * It will error out horribly without one.
1890 * Additionally, if you set any *drop member, you'd better have
1891 * set the corresponding dentry!
1892 */
1893 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1894 {
1895 ldout(cct, 20) << "encode_cap_releases enter (req: "
1896 << req << ", mds: " << mds << ")" << dendl;
1897 if (req->inode_drop && req->inode())
1898 encode_inode_release(req->inode(), req,
1899 mds, req->inode_drop,
1900 req->inode_unless);
1901
1902 if (req->old_inode_drop && req->old_inode())
1903 encode_inode_release(req->old_inode(), req,
1904 mds, req->old_inode_drop,
1905 req->old_inode_unless);
1906 if (req->other_inode_drop && req->other_inode())
1907 encode_inode_release(req->other_inode(), req,
1908 mds, req->other_inode_drop,
1909 req->other_inode_unless);
1910
1911 if (req->dentry_drop && req->dentry())
1912 encode_dentry_release(req->dentry(), req,
1913 mds, req->dentry_drop,
1914 req->dentry_unless);
1915
1916 if (req->old_dentry_drop && req->old_dentry())
1917 encode_dentry_release(req->old_dentry(), req,
1918 mds, req->old_dentry_drop,
1919 req->old_dentry_unless);
1920 ldout(cct, 25) << "encode_cap_releases exit (req: "
1921 << req << ", mds " << mds <<dendl;
1922 }
1923
1924 bool Client::have_open_session(mds_rank_t mds)
1925 {
1926 return
1927 mds_sessions.count(mds) &&
1928 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1929 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1930 }
1931
1932 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1933 {
1934 if (mds_sessions.count(mds) == 0)
1935 return NULL;
1936 MetaSession *s = mds_sessions[mds];
1937 if (s->con != con)
1938 return NULL;
1939 return s;
1940 }
1941
1942 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1943 {
1944 if (mds_sessions.count(mds))
1945 return mds_sessions[mds];
1946 return _open_mds_session(mds);
1947 }
1948
1949 /**
1950 * Populate a map of strings with client-identifying metadata,
1951 * such as the hostname. Call this once at initialization.
1952 */
1953 void Client::populate_metadata(const std::string &mount_root)
1954 {
1955 // Hostname
1956 struct utsname u;
1957 int r = uname(&u);
1958 if (r >= 0) {
1959 metadata["hostname"] = u.nodename;
1960 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1961 } else {
1962 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1963 }
1964
1965 metadata["pid"] = stringify(getpid());
1966
1967 // Ceph entity id (the '0' in "client.0")
1968 metadata["entity_id"] = cct->_conf->name.get_id();
1969
1970 // Our mount position
1971 if (!mount_root.empty()) {
1972 metadata["root"] = mount_root;
1973 }
1974
1975 // Ceph version
1976 metadata["ceph_version"] = pretty_version_to_str();
1977 metadata["ceph_sha1"] = git_version_to_str();
1978
1979 // Apply any metadata from the user's configured overrides
1980 std::vector<std::string> tokens;
1981 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1982 for (const auto &i : tokens) {
1983 auto eqpos = i.find("=");
1984 // Throw out anything that isn't of the form "<str>=<str>"
1985 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1986 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1987 continue;
1988 }
1989 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1990 }
1991 }
1992
1993 /**
1994 * Optionally add or override client metadata fields.
1995 */
1996 void Client::update_metadata(std::string const &k, std::string const &v)
1997 {
1998 Mutex::Locker l(client_lock);
1999 assert(initialized);
2000
2001 if (metadata.count(k)) {
2002 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2003 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
2004 }
2005
2006 metadata[k] = v;
2007 }
2008
// Create and register a new session to 'mds' and (unless this daemon
// previously rejected us) send the session-open request.  The session is
// returned in STATE_OPENING; callers wait on waiting_for_open.
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
  assert(mds_sessions.count(mds) == 0);
  MetaSession *session = new MetaSession;
  session->mds_num = mds;
  session->seq = 0;
  session->inst = mdsmap->get_inst(mds);
  session->con = messenger->get_connection(session->inst);
  session->state = MetaSession::STATE_OPENING;
  session->mds_state = MDSMap::STATE_NULL;
  // register before sending so replies can find the session
  mds_sessions[mds] = session;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->inst) {
      ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      // a different instance now holds this rank; the old rejection no
      // longer applies
      ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
  m->client_meta = metadata;
  session->con->send_message(m);
  return session;
}
2041
2042 void Client::_close_mds_session(MetaSession *s)
2043 {
2044 ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
2045 s->state = MetaSession::STATE_CLOSING;
2046 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2047 }
2048
// Tear down a session that is now closed: drop its connection, wake any
// waiters, drop its caps, requeue its in-flight requests, and free it.
void Client::_closed_mds_session(MetaSession *s)
{
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // wake anyone waiting for this session to open and any unmount waiter
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  // requests that were pending on this session must be resent elsewhere
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
  delete s;
}
2060
// Dispatch an incoming MClientSession message from an MDS.  Consumes the
// message (m->put()) on every path.
void Client::handle_client_session(MClientSession *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  // only accept the message if it matches a session on this connection
  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    // session established: renew caps, mark open, and wake waiters
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only extend the cap TTL if this ack matches our latest renew request
    if (session->cap_renew_seq == m->get_seq()) {
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      wake_inode_waiters(session);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    // force the TTL into the past so the caps are treated as expired
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    // MDS asks us to shrink our cap footprint to the given maximum
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    // remember which instance rejected us so we don't keep retrying it
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);

    break;

  default:
    ceph_abort();
  }

  m->put();
}
2128
2129 bool Client::_any_stale_sessions() const
2130 {
2131 assert(client_lock.is_locked_by_me());
2132
2133 for (const auto &i : mds_sessions) {
2134 if (i.second->state == MetaSession::STATE_STALE) {
2135 return true;
2136 }
2137 }
2138
2139 return false;
2140 }
2141
2142 void Client::_kick_stale_sessions()
2143 {
2144 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2145
2146 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2147 p != mds_sessions.end(); ) {
2148 MetaSession *s = p->second;
2149 ++p;
2150 if (s->state == MetaSession::STATE_STALE)
2151 _closed_mds_session(s);
2152 }
2153 }
2154
// (Re)build and transmit a MetaRequest to the given session's MDS.
// drop_cap_releases is set when resending before the cap reconnect has
// gone out, in which case any embedded cap releases are discarded.
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // we already received an unsafe reply, so this is a replay of an op the
    // MDS has applied; tell it so and identify the created target inode
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may reference a data pool; stamp the osdmap epoch we saw
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  // stamp only on the first send; retries keep the original send time
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // record the cap migration seq so ESTALE handling can later tell whether
  // the caps moved after this request was sent
  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2199
// Translate our internal MetaRequest into a wire-format MClientRequest.
// If the request has no filepath yet, one is derived from its inode or
// dentry.  Also bumps the request's retry_attempt counter.
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// null dentry: path is parent dir's path plus the dentry name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2237
2238
2239
// Handle a forward notice from an MDS: the request was (or should be)
// redirected to another rank.  We redirect the pending MetaRequest to the
// destination mds and wake the caller thread, which performs the resend.
// Consumes (puts) the message.
void Client::handle_client_request_forward(MClientRequestForward *fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    fwd->put();
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
    fwd->put();
    return;
  }

  MetaRequest *request = mds_requests[tid];
  assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  // NOTE(review): this log line says "handle_client_request" and streams
  // get_dest_mds() twice; presumably the first stream was meant to be the
  // forwarding mds -- confirm before changing, it is log-only.
  ldout(cct, 10) << "handle_client_request tid " << tid
		 << " fwd " << fwd->get_num_fwd() 
		 << " to mds." << fwd->get_dest_mds() 
		 << ", resending to " << fwd->get_dest_mds()
		 << dendl;
  
  // detach from the old session and let the caller thread resend to the
  // destination mds
  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->Signal();

  fwd->put();
}
2278
2279 bool Client::is_dir_operation(MetaRequest *req)
2280 {
2281 int op = req->get_op();
2282 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2283 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2284 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2285 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2286 return true;
2287 return false;
2288 }
2289
// Handle an MDS reply (unsafe or safe) for a pending MetaRequest.
// Attaches the reply to the request, wakes the caller thread, and on the
// safe reply cleans the request up.  Consumes (puts) the message unless it
// is attached to the request.
void Client::handle_client_reply(MClientReply *reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // Only give up (return ESTALE to the caller) if retrying would hit the
    // same mds and the caps have not migrated since we sent the request;
    // otherwise redirect and let the caller resend.
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      request->caller_cond->Signal();
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }
  
  assert(request->reply == NULL);
  request->reply = reply;       // request now owns the reply message
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;
    
    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wake for kick back: the caller clears dispatch_cond and signals it
    // once it has consumed the reply, so we can't race ahead of it
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
2392
// React to an OSD "full" condition for one pool (pool >= 0) or for the
// whole cluster (pool == -1): cancel in-flight writes with -ENOSPC and
// purge matching dirty cache so the writes are not re-issued.
void Client::_handle_full_flag(int64_t pool)
{
  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
    << "on " << pool << dendl;
  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
  // to do this rather than blocking, because otherwise when we fill up we
  // potentially lock caps forever on files with dirty pages, and we need
  // to be able to release those caps to the MDS so that it can delete files
  // and free up space.
  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);

  // For all inodes with layouts in this pool and a pending flush write op
  // (i.e. one of the ones we will cancel), we've got to purge_set their data
  // from ObjectCacher so that it doesn't re-issue the write in response to
  // the ENOSPC error.
  // Fortunately since we're cancelling everything in a given pool, we don't
  // need to know which ops belong to which ObjectSet, we can just blow all
  // the un-flushed cached data away and mark any dirty inodes' async_err
  // field with -ENOSPC as long as we're sure all the ops we cancelled were
  // affecting this pool, and all the objectsets we're purging were also
  // in this pool.
  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
       i != inode_map.end(); ++i)
  {
    Inode *inode = i->second;
    if (inode->oset.dirty_or_tx
        && (pool == -1 || inode->layout.pool_id == pool)) {
      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
        << " has dirty objects, purging and setting ENOSPC" << dendl;
      objectcacher->purge_set(&inode->oset);
      inode->set_async_err(-ENOSPC);  // surface the error on next fsync/close
    }
  }

  // Raise the cap epoch barrier so the MDS knows which ops we cancelled.
  if (cancelled_epoch != (epoch_t)-1) {
    set_cap_epoch_barrier(cancelled_epoch);
  }
}
2431
// Process a new OSDMap: detect (un)blacklisting of this client and react
// to cluster-wide or per-pool FULL flags.  Consumes (puts) the message.
void Client::handle_osd_map(MOSDMap *m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddr = messenger->get_myaddr();
  if (!blacklisted && new_blacklists.count(myaddr)) {
    // We were just blacklisted: fail everything fast so unmount can proceed.
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
        });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;
    // Abort every pending MDS request and wake its caller.
    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
         p != mds_requests.end(); ) {
      auto req = p->second;
      ++p;
      req->abort(-EBLACKLISTED);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
    }

    // Progress aborts on any requests that were on this waitlist.  Any
    // requests that were on a waiting_for_open session waitlist
    // will get kicked during close session below.
    signal_cond_list(waiting_for_mdsmap);

    // Force-close all sessions: assume this is not abandoning any state
    // on the MDS side because the MDS will have seen the blacklist too.
    while(!mds_sessions.empty()) {
      auto i = mds_sessions.begin();
      auto session = i->second;
      _closed_mds_session(session);
    }

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
        return o.is_blacklisted(myaddr);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // Cluster-wide full flag: cancel writes in every pool (-1 = all).
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }

  m->put();
}
2515
2516
2517 // ------------------------
2518 // incoming messages
2519
2520
// Central message dispatcher (called by the messenger).  Takes client_lock,
// routes each message type to its handler, and returns false for types we
// do not consume.  Handlers take ownership of (put) the message.
bool Client::ms_dispatch(Message *m)
{
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    // only MDS command replies are ours; let other dispatchers see the rest
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // While unmounting, use each dispatched message as an opportunity to trim
  // the cache, and poke unmount() whenever that made progress.
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size() 
		   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size() 
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2598
2599 void Client::handle_fs_map(MFSMap *m)
2600 {
2601 fsmap.reset(new FSMap(m->get_fsmap()));
2602 m->put();
2603
2604 signal_cond_list(waiting_for_fsmap);
2605
2606 monclient->sub_got("fsmap", fsmap->get_epoch());
2607 }
2608
2609 void Client::handle_fs_map_user(MFSMapUser *m)
2610 {
2611 fsmap_user.reset(new FSMapUser);
2612 *fsmap_user = m->get_fsmap();
2613 m->put();
2614
2615 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2616 signal_cond_list(waiting_for_fsmap);
2617 }
2618
2619 void Client::handle_mds_map(MMDSMap* m)
2620 {
2621 mds_gid_t old_inc, new_inc;
2622 if (m->get_epoch() <= mdsmap->get_epoch()) {
2623 ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
2624 << " is identical to or older than our "
2625 << mdsmap->get_epoch() << dendl;
2626 m->put();
2627 return;
2628 }
2629
2630 ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;
2631
2632 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2633 oldmap.swap(mdsmap);
2634
2635 mdsmap->decode(m->get_encoded());
2636
2637 // Cancel any commands for missing or laggy GIDs
2638 std::list<ceph_tid_t> cancel_ops;
2639 auto &commands = command_table.get_commands();
2640 for (const auto &i : commands) {
2641 auto &op = i.second;
2642 const mds_gid_t op_mds_gid = op.mds_gid;
2643 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2644 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2645 cancel_ops.push_back(i.first);
2646 if (op.outs) {
2647 std::ostringstream ss;
2648 ss << "MDS " << op_mds_gid << " went away";
2649 *(op.outs) = ss.str();
2650 }
2651 op.con->mark_down();
2652 if (op.on_finish) {
2653 op.on_finish->complete(-ETIMEDOUT);
2654 }
2655 }
2656 }
2657
2658 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2659 i != cancel_ops.end(); ++i) {
2660 command_table.erase(*i);
2661 }
2662
2663 // reset session
2664 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2665 p != mds_sessions.end(); ) {
2666 mds_rank_t mds = p->first;
2667 MetaSession *session = p->second;
2668 ++p;
2669
2670 int oldstate = oldmap->get_state(mds);
2671 int newstate = mdsmap->get_state(mds);
2672 if (!mdsmap->is_up(mds)) {
2673 session->con->mark_down();
2674 } else if (mdsmap->get_inst(mds) != session->inst) {
2675 old_inc = oldmap->get_incarnation(mds);
2676 new_inc = mdsmap->get_incarnation(mds);
2677 if (old_inc != new_inc) {
2678 ldout(cct, 1) << "mds incarnation changed from "
2679 << old_inc << " to " << new_inc << dendl;
2680 oldstate = MDSMap::STATE_NULL;
2681 }
2682 session->con->mark_down();
2683 session->inst = mdsmap->get_inst(mds);
2684 // When new MDS starts to take over, notify kernel to trim unused entries
2685 // in its dcache/icache. Hopefully, the kernel will release some unused
2686 // inodes before the new MDS enters reconnect state.
2687 trim_cache_for_reconnect(session);
2688 } else if (oldstate == newstate)
2689 continue; // no change
2690
2691 session->mds_state = newstate;
2692 if (old_inc != new_inc && newstate > MDSMap::STATE_RECONNECT) {
2693 // missed reconnect close the session so that it can be reopened
2694 _closed_mds_session(session);
2695 continue;
2696 }
2697 if (newstate == MDSMap::STATE_RECONNECT) {
2698 session->con = messenger->get_connection(session->inst);
2699 send_reconnect(session);
2700 } else if (newstate >= MDSMap::STATE_ACTIVE) {
2701 if (oldstate < MDSMap::STATE_ACTIVE) {
2702 // kick new requests
2703 kick_requests(session);
2704 kick_flushing_caps(session);
2705 signal_context_list(session->waiting_for_open);
2706 kick_maxsize_requests(session);
2707 wake_inode_waiters(session);
2708 }
2709 connect_mds_targets(mds);
2710 } else if (newstate == MDSMap::STATE_NULL &&
2711 mds >= mdsmap->get_max_mds()) {
2712 _closed_mds_session(session);
2713 }
2714 }
2715
2716 // kick any waiting threads
2717 signal_cond_list(waiting_for_mdsmap);
2718
2719 m->put();
2720
2721 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2722 }
2723
// Send the cap reconnect message to an MDS that entered STATE_RECONNECT:
// reset per-session sequence state, resend unsafe/old requests, and report
// every cap (and snaprealm) we hold from that rank so the MDS can rebuild
// its session state.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // drop any batched cap releases; they refer to pre-reconnect cap state
  if (session->release) {
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
	       << " " << ccap_string(in->caps[mds]->issued)
	       << " wants " << ccap_string(in->caps_wanted())
	       << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << "    path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      // Reset cap sequence numbers; the new MDS incarnation starts fresh.
      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino, 
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // report each snaprealm at most once
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
2799
2800
2801 void Client::kick_requests(MetaSession *session)
2802 {
2803 ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2804 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2805 p != mds_requests.end();
2806 ++p) {
2807 MetaRequest *req = p->second;
2808 if (req->got_unsafe)
2809 continue;
2810 if (req->aborted()) {
2811 if (req->caller_cond) {
2812 req->kick = true;
2813 req->caller_cond->Signal();
2814 }
2815 continue;
2816 }
2817 if (req->retry_attempt > 0)
2818 continue; // new requests only
2819 if (req->mds == session->mds_num) {
2820 send_request(p->second, session);
2821 }
2822 }
2823 }
2824
// Resend requests to an MDS entering reconnect: first every request that
// already got an unsafe reply (so the MDS can replay it), then every old
// (already-retried) request so the MDS can process completed requests in
// its clientreplay stage.
void Client::resend_unsafe_requests(MetaSession *session)
{
  for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
       !iter.end();
       ++iter)
    send_request(*iter, session);

  // also re-send old requests when MDS enters reconnect stage. So that MDS can
  // process completed requests in clientreplay stage.
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted())
      continue;
    if (req->retry_attempt == 0)
      continue; // old requests only
    if (req->mds == session->mds_num)
      send_request(req, session, true);  // drop cap releases: pre-reconnect state
  }
}
2848
// Block until every currently outstanding unsafe request has been committed
// (become safe).  It is enough to wait on the *last* unsafe request of each
// session, since the MDS replies in order on a session.
void Client::wait_unsafe_requests()
{
  list<MetaRequest*> last_unsafe_reqs;
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    MetaSession *s = p->second;
    if (!s->unsafe_requests.empty()) {
      MetaRequest *req = s->unsafe_requests.back();
      req->get();  // hold a ref: the request may complete while we wait
      last_unsafe_reqs.push_back(req);
    }
  }

  for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
       p != last_unsafe_reqs.end();
       ++p) {
    MetaRequest *req = *p;
    // still unsafe? wait for the safe reply to signal waitfor_safe
    if (req->unsafe_item.is_on_list())
      wait_on_list(req->waitfor_safe);
    put_request(req);
  }
}
2872
// A session is closing for good: wake the caller of every request targeted
// at that mds so it can react, and unregister any request that had already
// received an unsafe reply (it can never become safe now).
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;  // advance first: unregister_request() below removes the entry
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
2899
2900
2901
2902
2903 /************
2904 * leases
2905 */
2906
2907 void Client::got_mds_push(MetaSession *s)
2908 {
2909 s->seq++;
2910 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2911 if (s->state == MetaSession::STATE_CLOSING) {
2912 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2913 }
2914 }
2915
// Handle a dentry-lease revocation from the MDS (the only lease action we
// expect, per the assert).  Invalidate the lease locally if we hold it,
// then always acknowledge with a LEASE_RELEASE.  Consumes (puts) the message.
void Client::handle_lease(MClientLease *m)
{
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    // nothing to invalidate locally; still must ack below
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // mark lease as no longer held from any mds
  }

 revoke:
  // always ack the revoke, whether or not we held anything
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2958
// Drop n references on an inode; on the last reference, release its caps,
// verify the object cache is clean, remove it from inode_map (and the
// root pointers if applicable), and delete it.
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << "put_inode on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    assert(!unclean);  // a dying inode must have no dirty/tx buffers left
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      // dropping the root: clear all root bookkeeping
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
2984
2985 void Client::close_dir(Dir *dir)
2986 {
2987 Inode *in = dir->parent_inode;
2988 ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
2989 assert(dir->is_empty());
2990 assert(in->dir == dir);
2991 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
2992 if (!in->dn_set.empty())
2993 in->get_first_parent()->put(); // unpin dentry
2994
2995 delete in->dir;
2996 in->dir = 0;
2997 put_inode(in); // unpin inode
2998 }
2999
/**
 * Link a dentry (named 'name' in 'dir') to inode 'in'.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * Returns the dentry (newly created if dn was NULL).
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;
    
    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn);    // mid or top?
    if (!in)
      dir->num_null_dentries++;

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // reusing a pre-created (null) dentry
    assert(!dn->inode);
    if (in)
      dir->num_null_dentries--;
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    if (in->is_dir()) {
      // extra dentry pins while the dir object / ll refs exist
      if (in->dir)
	dn->get(); // dir -> dn pin
      if (in->ll_ref)
	dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dn_set << dendl; 
  }
  
  return dn;
}
3057
3058 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3059 {
3060 InodeRef in;
3061 in.swap(dn->inode);
3062 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3063 << " inode " << dn->inode << dendl;
3064
3065 // unlink from inode
3066 if (in) {
3067 if (in->is_dir()) {
3068 if (in->dir)
3069 dn->put(); // dir -> dn pin
3070 if (in->ll_ref)
3071 dn->put(); // ll_ref -> dn pin
3072 }
3073 dn->inode = 0;
3074 assert(in->dn_set.count(dn));
3075 in->dn_set.erase(dn);
3076 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3077 }
3078
3079 if (keepdentry) {
3080 dn->lease_mds = -1;
3081 if (in)
3082 dn->dir->num_null_dentries++;
3083 } else {
3084 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3085
3086 // unlink from dir
3087 dn->dir->dentries.erase(dn->name);
3088 if (!in)
3089 dn->dir->num_null_dentries--;
3090 if (dn->dir->is_empty() && !keepdir)
3091 close_dir(dn->dir);
3092 dn->dir = 0;
3093
3094 // delete den
3095 lru.lru_remove(dn);
3096 dn->put();
3097 }
3098 }
3099
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 *
 * Completion callback attached to async flush ops; holds an InodeRef so
 * the inode stays alive until the flush finishes.  Runs under client_lock.
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;  // keeps the inode pinned for the lifetime of the flush
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // stash the error so it can be reported on a later fsync/close
      inode->set_async_err(r);
    }
  }
};
3121
3122
3123 /****
3124 * caps
3125 */
3126
3127 void Client::get_cap_ref(Inode *in, int cap)
3128 {
3129 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3130 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3131 ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3132 in->get();
3133 }
3134 if ((cap & CEPH_CAP_FILE_CACHE) &&
3135 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3136 ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3137 in->get();
3138 }
3139 in->get_cap_ref(cap);
3140 }
3141
// Drop one reference on each cap bit in 'cap'.  Inode::put_cap_ref()
// returns the subset of bits whose count just reached zero; only those
// need follow-up work (finishing cap snaps, waking waiters, reporting
// droppable caps to the MDS, releasing the inode pins taken in
// get_cap_ref()).
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we no longer use and that the MDS hasn't (re)issued: candidates
    // for release via check_caps() below
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      // last writer gone: a pending cap_snap can now capture a stable state
      if ((last & CEPH_CAP_FILE_WR) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        // all buffered writes are out; cap snaps no longer hold dirty data
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;  // matches the in->get() taken in get_cap_ref()
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;  // matches the in->get() taken in get_cap_ref()
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3175
// Block until we hold the caps in 'need' (plus whatever of 'want' is
// available without waiting out a revocation).  On success *phave is the
// acquired set and a cap reference is taken on 'need'; the caller must
// balance it with put_cap_ref().  Returns 0, or a negative error:
// -EBADF if the open file modes no longer want 'need', -EROFS for a write
// cap on a readonly session, or an error from check_pool_perm()/
// _renew_caps().  May drop and retake client_lock while waiting.
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // writer path: make sure the MDS has granted room up to 'endoff'
      if (endoff > 0 &&
          (endoff >= (loff_t)in->max_size ||
           endoff > (loff_t)(in->size << 1)) &&
          endoff > (loff_t)in->wanted_max_size) {
        ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
        in->wanted_max_size = endoff;
        check_caps(in, 0);  // asks the auth MDS for a larger max_size
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      // can't start new writes while a cap_snap is being captured or
      // still holds dirty buffered data
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          // kick off a flush; C_Client_FlushComplete records any IO error
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                       << " need " << ccap_string(need) << " want " << ccap_string(want)
                       << " revoking " << ccap_string(revoking)
                       << dendl;
        // only take the 'want' bits that are not mid-revocation
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // our caps were dropped (e.g. session reset) and the MDS may no
      // longer know what we want; re-request before sleeping
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if ((mds_wanted & file_wanted) ==
          (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
        in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3270
3271 int Client::get_caps_used(Inode *in)
3272 {
3273 unsigned used = in->caps_used();
3274 if (!(used & CEPH_CAP_FILE_CACHE) &&
3275 !objectcacher->set_is_empty(&in->oset))
3276 used |= CEPH_CAP_FILE_CACHE;
3277 return used;
3278 }
3279
3280 void Client::cap_delay_requeue(Inode *in)
3281 {
3282 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3283 in->hold_caps_until = ceph_clock_now();
3284 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3285 delayed_list.push_back(&in->delay_cap_item);
3286 }
3287
// Send one cap update/flush message for 'cap' to its MDS session,
// trimming our local issued/implemented sets down to 'retain'.
// 'flush' is the set of dirty cap bits being flushed, identified by
// 'flush_tid'; 'sync' requests a synchronous ack from the MDS.
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      bool sync, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;              // never retain what is being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << (sync ? " sync " : " async ")
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  // Test hook: pretend we failed to release caps (see #9800).
  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour: shrink to what we keep plus what is still in use.
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
                                   in->ino,
                                   0,
                                   cap->cap_id, cap->seq,
                                   cap->implemented,
                                   want,
                                   flush,
                                   cap->mseq,
                                   cap_epoch_barrier);
  // attribute the flush to whoever dirtied the caps
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  // the MDS sizes future max_size grants off what we last reported
  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3397
3398 static bool is_max_size_approaching(Inode *in)
3399 {
3400 /* mds will adjust max size according to the reported size */
3401 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3402 return false;
3403 if (in->size >= in->max_size)
3404 return true;
3405 /* half of previous max_size increment has been used */
3406 if (in->max_size > in->reported_size &&
3407 (in->size << 1) >= in->max_size + in->reported_size)
3408 return true;
3409 return false;
3410 }
3411
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * For every cap on the inode, either skip it (MDS already knows what we
 * want, nothing to drop, nothing dirty, or a delay is in effect) or fall
 * through to 'ack' and send a cap message via send_cap().
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_NODELAY /
 *              CHECK_CAPS_SYNCHRONOUS)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  if (in->is_dir() && (in->flags & I_COMPLETE)) {
    // we do this here because we don't want to drop to Fs (and then
    // drop the Fs if we do a create!) if that alone makes us send lookups
    // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
    wanted |= CEPH_CAP_FILE_EXCL;
  }

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  // what we'd like to keep holding locally
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting) {
    if (wanted)
      retain |= CEPH_CAP_ANY;
    else
      retain |= CEPH_CAP_ANY_SHARED;
  }

  ldout(cct, 10) << "check_caps on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // a CACHE/LAZYIO revocation can be satisfied immediately if nothing is
  // buffered: drop the object cache and stop counting CACHE as used
  if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
      (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) {
    if (_release(in))
      used &= ~CEPH_CAP_FILE_CACHE;
  }

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  if (flags & CHECK_CAPS_NODELAY)
    in->hold_caps_until = utime_t();
  else
    cap_delay_requeue(in);

  utime_t now = ceph_clock_now();

  // iterator is advanced before the body runs; NOTE(review): looks like a
  // guard against the current entry being dropped while handling it —
  // confirm which callee can erase a cap here
  map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
  while (it != in->caps.end()) {
    mds_rank_t mds = it->first;
    Cap *cap = it->second;
    ++it;

    MetaSession *session = mds_sessions[mds];
    assert(session);

    // for a non-auth cap, usage covered by the auth cap doesn't block
    // this MDS's revocations
    cap_used = used;
    if (in->auth_cap && cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap->implemented & ~cap->issued;

    ldout(cct, 10) << " cap mds." << mds
                   << " issued " << ccap_string(cap->issued)
                   << " implemented " << ccap_string(cap->implemented)
                   << " revoking " << ccap_string(revoking) << dendl;

    // need a larger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap->issued & CEPH_CAP_FILE_WR) &&
        cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap->wanted | cap->issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if (wanted == cap->wanted &&         // mds knows what we want.
        ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
        !in->dirty_caps)                 // and we have no dirty caps
      continue;

    if (now < in->hold_caps_until) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
        session->mds_state < MDSMap::STATE_ACTIVE &&
        session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
        flush_snaps(in, true);
      if (in->flushing_caps)
        flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
             retain, flushing, flush_tid);
  }
}
3560
3561
// Capture the inode's state for the just-closed snap context 'old_snapc'
// as a CapSnap, to be flushed to the MDS later.  If a write is in flight
// (FILE_WR in use) the snapshot is marked 'writing' and completed from
// put_cap_ref(); otherwise it is finished immediately.
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // the most recent cap_snap is still waiting on its writer; don't stack
    // another one
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||     // NOTE(review): re-queries caps_dirty();
                                     // 'dirty' above already holds this value
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    // keyed by the old snap context's seq; must not already exist
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // size/mtime not stable yet; finish_cap_snap() runs when the last
      // writer drops its FILE_WR ref (see put_cap_ref)
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3601
// Record the now-stable file metadata into 'capsnap' and flush it, unless
// buffered data (FILE_BUFFER) is still outstanding — in that case the
// flush is deferred until the buffers drain (see put_cap_ref).
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  // pick up anything dirtied since the snapshot was queued
  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // dirty buffers remain; flush_snaps() will be retried once they drain
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3627
3628 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3629 {
3630 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3631 in->cap_snaps.at(seq).dirty_data = 0;
3632 flush_snaps(in);
3633 }
3634
// Send FLUSHSNAP messages to the auth MDS for every flushable cap_snap on
// the inode.  Snaps that still have dirty buffered data or an active
// writer are skipped.  With all_again=true, snaps that were already sent
// (flush_tid > 0) are re-sent — used when reflushing after reconnect.
void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  assert(in->cap_snaps.size());

  // pick auth mds
  assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
        continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    // not stable yet: data unflushed or writer still active
    if (capsnap.dirty_data || capsnap.writing)
      continue;

    if (capsnap.flush_tid == 0) {
      // first send: register the flush with the session so completion
      // tracking (oldest_flush_tid) covers it
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
        session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
                                     cap_epoch_barrier);
    if (user_id >= 0)
      m->caller_uid = user_id;
    if (group_id >= 0)
      m->caller_gid = group_id;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    ::encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message(m);
  }
}
3711
3712
3713
3714 void Client::wait_on_list(list<Cond*>& ls)
3715 {
3716 Cond cond;
3717 ls.push_back(&cond);
3718 cond.Wait(client_lock);
3719 ls.remove(&cond);
3720 }
3721
3722 void Client::signal_cond_list(list<Cond*>& ls)
3723 {
3724 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3725 (*it)->Signal();
3726 }
3727
3728 void Client::wait_on_context_list(list<Context*>& ls)
3729 {
3730 Cond cond;
3731 bool done = false;
3732 int r;
3733 ls.push_back(new C_Cond(&cond, &done, &r));
3734 while (!done)
3735 cond.Wait(client_lock);
3736 }
3737
3738 void Client::signal_context_list(list<Context*>& ls)
3739 {
3740 while (!ls.empty()) {
3741 ls.front()->complete(0);
3742 ls.pop_front();
3743 }
3744 }
3745
3746 void Client::wake_inode_waiters(MetaSession *s)
3747 {
3748 xlist<Cap*>::iterator iter = s->caps.begin();
3749 while (!iter.end()){
3750 signal_cond_list((*iter)->inode->waitfor_caps);
3751 ++iter;
3752 }
3753 }
3754
3755
3756 // flush dirty data (from objectcache)
3757
// Finisher-queued callback that invokes the kernel cache-invalidation
// callback for a byte range of one inode.  The vinodeno is captured at
// construction time (faked ino if enabled) so no inode pointer is held.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3777
3778 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3779 {
3780 if (unmounting)
3781 return;
3782 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3783 ino_invalidate_cb(callback_handle, ino, off, len);
3784 }
3785
3786 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3787
3788 if (ino_invalidate_cb)
3789 // we queue the invalidate, which calls the callback and decrements the ref
3790 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3791 }
3792
3793 void Client::_invalidate_inode_cache(Inode *in)
3794 {
3795 ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
3796
3797 // invalidate our userspace inode cache
3798 if (cct->_conf->client_oc) {
3799 objectcacher->release_set(&in->oset);
3800 if (!objectcacher->set_is_empty(&in->oset))
3801 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3802 }
3803
3804 _schedule_invalidate_callback(in, 0, 0);
3805 }
3806
3807 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3808 {
3809 ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
3810
3811 // invalidate our userspace inode cache
3812 if (cct->_conf->client_oc) {
3813 vector<ObjectExtent> ls;
3814 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3815 objectcacher->discard_writeback(&in->oset, ls, nullptr);
3816 }
3817
3818 _schedule_invalidate_callback(in, off, len);
3819 }
3820
3821 bool Client::_release(Inode *in)
3822 {
3823 ldout(cct, 20) << "_release " << *in << dendl;
3824 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3825 _invalidate_inode_cache(in);
3826 return true;
3827 }
3828 return false;
3829 }
3830
3831 bool Client::_flush(Inode *in, Context *onfinish)
3832 {
3833 ldout(cct, 10) << "_flush " << *in << dendl;
3834
3835 if (!in->oset.dirty_or_tx) {
3836 ldout(cct, 10) << " nothing to flush" << dendl;
3837 onfinish->complete(0);
3838 return true;
3839 }
3840
3841 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3842 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3843 objectcacher->purge_set(&in->oset);
3844 if (onfinish) {
3845 onfinish->complete(-ENOSPC);
3846 }
3847 return true;
3848 }
3849
3850 return objectcacher->flush_set(&in->oset, onfinish);
3851 }
3852
// Synchronously flush dirty/tx buffers overlapping [offset, offset+size)
// from the object cache.  Drops and retakes client_lock while waiting, so
// inode state may change across this call.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // private lock/cond pair so the flush completion can fire while
  // client_lock is released below
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  // file_flush returns true if the range was already clean/flushed
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3877
3878 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3879 {
3880 // Mutex::Locker l(client_lock);
3881 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3882 Inode *in = static_cast<Inode *>(oset->parent);
3883 assert(in);
3884 _flushed(in);
3885 }
3886
3887 void Client::_flushed(Inode *in)
3888 {
3889 ldout(cct, 10) << "_flushed " << *in << dendl;
3890
3891 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
3892 }
3893
3894
3895
3896 // checks common to add_update_cap, handle_cap_grant
3897 void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3898 {
3899 unsigned had = in->caps_issued();
3900
3901 if ((issued & CEPH_CAP_FILE_CACHE) &&
3902 !(had & CEPH_CAP_FILE_CACHE))
3903 in->cache_gen++;
3904
3905 if ((issued & CEPH_CAP_FILE_SHARED) &&
3906 !(had & CEPH_CAP_FILE_SHARED)) {
3907 in->shared_gen++;
3908
3909 if (in->is_dir())
3910 clear_dir_complete_and_ordered(in, true);
3911 }
3912 }
3913
// Install or refresh the cap granted by mds_session for 'in'.  Creates the
// Cap (and opens the snap realm on the first cap) if this MDS had none,
// handles the export/import race where a stale grant arrives after an
// auth-change, moves auth-ness when appropriate, and wakes cap waiters if
// new bits were issued.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
                            int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // first cap from this MDS
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // very first cap on the inode: attach it to its snap realm
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as auth only if it is newer (by mseq) than the
    // current auth cap
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << "add_update_cap changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued = issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->gen = mds_session->cap_gen;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
        continue;
      if (it->second->implemented & ~it->second->issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // new bits may unblock get_caps() waiters
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
3999
// Tear down one cap: optionally queue a cap-release message to the MDS,
// detach the inode from per-session flushing state if this was the auth
// cap, erase and free the Cap, and close the snap realm when the last cap
// on the inode goes away.  Invalidates iterators into in->caps and
// session->caps pointing at this cap.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    // batched into the session's next cap-release message
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;  // defensive: local copy only, cap is gone

  if (!in->is_any_caps()) {
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
4038
4039 void Client::remove_all_caps(Inode *in)
4040 {
4041 while (!in->caps.empty())
4042 remove_cap(in->caps.begin()->second, true);
4043 }
4044
// Forcibly drop every cap held through session 's' (e.g. on session
// close/eviction) WITHOUT sending releases, discarding any pending cap
// snaps and dirty/flushing cap state, and waking anyone blocked on caps.
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      // losing the auth cap: remember what state we are about to throw
      // away, and mark the inode so get_caps() re-requests from the MDS
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;  // bitwise-or into bool: true if either set
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;
    }
    remove_cap(cap, false);
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // tmp_ref keeps the inode alive while its cap_snaps (which may hold
      // the last references) are destroyed
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // drop the inode reference held on behalf of the flush
      put_inode(in);
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();  // wake flush waiters: these tids will never complete
}
4080
// Ask the mounter (via remount_cb) to remount the filesystem so the
// kernel trims its dentries.  On success the retry counter resets; on
// failure the error is logged and, if the relevant "die on failure"
// options are set and the retry budget (with retry_on_error) is spent,
// the client aborts.  Returns the callback's return code.
int Client::_do_remount(bool retry_on_error)
{
  uint64_t max_retries = cct->_conf->get_val<uint64_t>("mds_max_retries_on_remount_failure");

  errno = 0;
  int r = remount_cb(callback_handle);
  if (r == 0) {
    retries_on_invalidate = 0;
  } else {
    int e = errno;  // capture before any call can clobber it
    // NOTE(review): 'whoami' appears unused but presumably feeds the
    // lderr prefix macro, as with C_Client_FlushComplete — confirm
    client_t whoami = get_nodeid();
    if (r == -1) {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "errno = " << e << " (" << strerror(e) << ")" << dendl;
    } else {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "return code = " << r << dendl;
    }
    bool should_abort =
      (cct->_conf->get_val<bool>("client_die_on_failed_remount") ||
       cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
      !(retry_on_error && (++retries_on_invalidate < max_retries));
    if (should_abort && !unmounting) {
      lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
      ceph_abort();
    }
  }
  return r;
}
4112
// Finisher-queued callback that triggers a remount (with retries) to make
// the kernel trim its dentry cache; queued by _invalidate_kernel_dcache().
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    assert(r == 0);
    client->_do_remount(true);
  }
};
4123
4124 void Client::_invalidate_kernel_dcache()
4125 {
4126 if (unmounting)
4127 return;
4128 if (can_invalidate_dentries) {
4129 if (dentry_invalidate_cb && root->dir) {
4130 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4131 p != root->dir->dentries.end();
4132 ++p) {
4133 if (p->second->inode)
4134 _schedule_invalidate_dentry_callback(p->second, false);
4135 }
4136 }
4137 } else if (remount_cb) {
4138 // Hacky:
4139 // when remounting a file system, linux kernel trims all unused dentries in the fs
4140 remount_finisher.queue(new C_Client_Remount(this));
4141 }
4142 }
4143
// If every dentry under 'in' is negative (no inode), unlink the
// expireable ones and close the directory once empty; recurse into the
// snapdir if one is open.  Only fires when ALL children are null, so a
// directory with any real entries is left untouched.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance before unlink(): it erases dn from dir->dentries
      ++p;
      assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4168
// Try to reduce the number of caps held on session 's' to at most 'max'
// (requested by the MDS when it is under cache pressure).  Disposable
// non-auth caps are removed outright; otherwise we try to trim the inode's
// dentries so the cap can be released later.  If we still hold too many
// caps afterwards, fall back to invalidating the whole kernel dcache.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
		 << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    // hold a ref so the inode can't be freed while we work on it
    InodeRef in(cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap: nothing in use that the auth cap can't cover
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	cap = (remove_cap(cap, true), nullptr);
	trimmed++;
      }
    } else {
      // auth (or only) cap: try to make the inode trimmable by expiring
      // its dentries instead
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      while (q != in->dn_set.end()) {
	Dentry *dn = *q++;
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
	} else {
	  ldout(cct, 20) << " not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      // count the inode as trimmed only if every dentry was expireable
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }
  // actually trim the queued dentries now that cap traversal is done
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > max)
    _invalidate_kernel_dcache();
}
4235
4236 void Client::force_session_readonly(MetaSession *s)
4237 {
4238 s->readonly = true;
4239 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4240 Inode *in = (*p)->inode;
4241 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4242 signal_cond_list(in->waitfor_caps);
4243 }
4244 }
4245
// Transition in's dirty caps into "flushing" state under a fresh flush tid.
// Records the tid on both the inode and the auth session, links the inode
// onto the session's flushing list, and returns the cap bits being flushed
// (the tid is stored through *ptid).  The caller sends the actual flush.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  // num_flushing_caps counts inodes, so bump it only on the 0 -> nonzero edge
  if (!in->flushing_caps) {
    ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();
 
  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4273
4274 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4275 {
4276 for (auto &p : in->cap_snaps) {
4277 CapSnap &capsnap = p.second;
4278 if (capsnap.flush_tid > 0) {
4279 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4280 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4281 }
4282 }
4283 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4284 it != in->flushing_cap_tids.end();
4285 ++it) {
4286 old_s->flushing_caps_tids.erase(it->first);
4287 new_s->flushing_caps_tids.insert(it->first);
4288 }
4289 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4290 }
4291
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // first drain the delayed list; each inode is popped before check_caps so
  // the list shrinks as we go
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    delayed_list.pop_front();
    // only the very last check_caps overall (no more delayed entries and no
    // dirty entries to follow) is made synchronous
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
4324
4325 void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
4326 {
4327 ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
4328 Cap *cap = in->auth_cap;
4329 assert(cap->session == session);
4330
4331 for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
4332 p != in->flushing_cap_tids.end();
4333 ++p) {
4334 bool req_sync = false;
4335
4336 /* If this is a synchronous request, then flush the journal on last one */
4337 if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
4338 req_sync = true;
4339
4340 send_cap(in, session, cap, req_sync,
4341 (get_caps_used(in) | in->caps_dirty()),
4342 in->caps_wanted(), (cap->issued | cap->implemented),
4343 p->second, p->first);
4344 }
4345 }
4346
4347 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4348 {
4349 while (in->flushing_caps) {
4350 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4351 assert(it != in->flushing_cap_tids.end());
4352 if (it->first > want)
4353 break;
4354 ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
4355 << ccap_string(it->second) << " want " << want
4356 << " last " << it->first << dendl;
4357 wait_on_list(in->waitfor_caps);
4358 }
4359 }
4360
4361 void Client::wait_sync_caps(ceph_tid_t want)
4362 {
4363 retry:
4364 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4365 << num_flushing_caps << " total flushing)" << dendl;
4366 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4367 p != mds_sessions.end();
4368 ++p) {
4369 MetaSession *s = p->second;
4370 if (s->flushing_caps_tids.empty())
4371 continue;
4372 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4373 if (oldest_tid <= want) {
4374 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4375 << " (want " << want << ")" << dendl;
4376 sync_cond.Wait(client_lock);
4377 goto retry;
4378 }
4379 }
4380 }
4381
4382 void Client::kick_flushing_caps(MetaSession *session)
4383 {
4384 mds_rank_t mds = session->mds_num;
4385 ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;
4386
4387 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4388 Inode *in = *p;
4389 if (session->early_flushing_caps.count(in))
4390 continue;
4391 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4392 if (in->cap_snaps.size())
4393 flush_snaps(in, true);
4394 if (in->flushing_caps)
4395 flush_caps(in, session);
4396 }
4397
4398 session->early_flushing_caps.clear();
4399 }
4400
4401 void Client::early_kick_flushing_caps(MetaSession *session)
4402 {
4403 session->early_flushing_caps.clear();
4404
4405 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4406 Inode *in = *p;
4407 assert(in->auth_cap);
4408
4409 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4410 // stage. This guarantees that MDS processes the cap flush message before issuing
4411 // the flushing caps to other client.
4412 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
4413 continue;
4414
4415 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4416 << " to mds." << session->mds_num << dendl;
4417
4418 session->early_flushing_caps.insert(in);
4419
4420 if (in->cap_snaps.size())
4421 flush_snaps(in, true);
4422 if (in->flushing_caps)
4423 flush_caps(in, session);
4424
4425 }
4426 }
4427
4428 void Client::kick_maxsize_requests(MetaSession *session)
4429 {
4430 xlist<Cap*>::iterator iter = session->caps.begin();
4431 while (!iter.end()){
4432 (*iter)->inode->requested_max_size = 0;
4433 (*iter)->inode->wanted_max_size = 0;
4434 signal_cond_list((*iter)->inode->waitfor_caps);
4435 ++iter;
4436 }
4437 }
4438
4439 void SnapRealm::build_snap_context()
4440 {
4441 set<snapid_t> snaps;
4442 snapid_t max_seq = seq;
4443
4444 // start with prior_parents?
4445 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4446 snaps.insert(prior_parent_snaps[i]);
4447
4448 // current parent's snaps
4449 if (pparent) {
4450 const SnapContext& psnapc = pparent->get_snap_context();
4451 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4452 if (psnapc.snaps[i] >= parent_since)
4453 snaps.insert(psnapc.snaps[i]);
4454 if (psnapc.seq > max_seq)
4455 max_seq = psnapc.seq;
4456 }
4457
4458 // my snaps
4459 for (unsigned i=0; i<my_snaps.size(); i++)
4460 snaps.insert(my_snaps[i]);
4461
4462 // ok!
4463 cached_snap_context.seq = max_seq;
4464 cached_snap_context.snaps.resize(0);
4465 cached_snap_context.snaps.reserve(snaps.size());
4466 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4467 cached_snap_context.snaps.push_back(*p);
4468 }
4469
4470 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4471 {
4472 list<SnapRealm*> q;
4473 q.push_back(realm);
4474
4475 while (!q.empty()) {
4476 realm = q.front();
4477 q.pop_front();
4478
4479 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4480 realm->invalidate_cache();
4481
4482 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4483 p != realm->pchildren.end();
4484 ++p)
4485 q.push_back(*p);
4486 }
4487 }
4488
4489 SnapRealm *Client::get_snap_realm(inodeno_t r)
4490 {
4491 SnapRealm *realm = snap_realms[r];
4492 if (!realm)
4493 snap_realms[r] = realm = new SnapRealm(r);
4494 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4495 realm->nref++;
4496 return realm;
4497 }
4498
4499 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4500 {
4501 if (snap_realms.count(r) == 0) {
4502 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4503 return NULL;
4504 }
4505 SnapRealm *realm = snap_realms[r];
4506 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4507 realm->nref++;
4508 return realm;
4509 }
4510
4511 void Client::put_snap_realm(SnapRealm *realm)
4512 {
4513 ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
4514 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4515 if (--realm->nref == 0) {
4516 snap_realms.erase(realm->ino);
4517 if (realm->pparent) {
4518 realm->pparent->pchildren.erase(realm);
4519 put_snap_realm(realm->pparent);
4520 }
4521 delete realm;
4522 }
4523 }
4524
4525 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4526 {
4527 if (realm->parent != parent) {
4528 ldout(cct, 10) << "adjust_realm_parent " << *realm
4529 << " " << realm->parent << " -> " << parent << dendl;
4530 realm->parent = parent;
4531 if (realm->pparent) {
4532 realm->pparent->pchildren.erase(realm);
4533 put_snap_realm(realm->pparent);
4534 }
4535 realm->pparent = get_snap_realm(parent);
4536 realm->pparent->pchildren.insert(realm);
4537 return true;
4538 }
4539 return false;
4540 }
4541
4542 static bool has_new_snaps(const SnapContext& old_snapc,
4543 const SnapContext& new_snapc)
4544 {
4545 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4546 }
4547
4548
4549 void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
4550 {
4551 SnapRealm *first_realm = NULL;
4552 ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
4553
4554 map<SnapRealm*, SnapContext> dirty_realms;
4555
4556 bufferlist::iterator p = bl.begin();
4557 while (!p.end()) {
4558 SnapRealmInfo info;
4559 ::decode(info, p);
4560 SnapRealm *realm = get_snap_realm(info.ino());
4561
4562 bool invalidate = false;
4563
4564 if (info.seq() > realm->seq) {
4565 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
4566 << dendl;
4567
4568 if (flush) {
4569 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4570 // flush me + children
4571 list<SnapRealm*> q;
4572 q.push_back(realm);
4573 while (!q.empty()) {
4574 SnapRealm *realm = q.front();
4575 q.pop_front();
4576
4577 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4578 p != realm->pchildren.end();
4579 ++p)
4580 q.push_back(*p);
4581
4582 if (dirty_realms.count(realm) == 0) {
4583 realm->nref++;
4584 dirty_realms[realm] = realm->get_snap_context();
4585 }
4586 }
4587 }
4588
4589 // update
4590 realm->seq = info.seq();
4591 realm->created = info.created();
4592 realm->parent_since = info.parent_since();
4593 realm->prior_parent_snaps = info.prior_parent_snaps;
4594 realm->my_snaps = info.my_snaps;
4595 invalidate = true;
4596 }
4597
4598 // _always_ verify parent
4599 if (adjust_realm_parent(realm, info.parent()))
4600 invalidate = true;
4601
4602 if (invalidate) {
4603 invalidate_snaprealm_and_children(realm);
4604 ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
4605 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4606 } else {
4607 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
4608 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4609 }
4610
4611 if (!first_realm)
4612 first_realm = realm;
4613 else
4614 put_snap_realm(realm);
4615 }
4616
4617 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4618 q != dirty_realms.end();
4619 ++q) {
4620 SnapRealm *realm = q->first;
4621 // if there are new snaps ?
4622 if (has_new_snaps(q->second, realm->get_snap_context())) {
4623 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4624 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4625 while (!r.end()) {
4626 Inode *in = *r;
4627 ++r;
4628 queue_cap_snap(in, q->second);
4629 }
4630 } else {
4631 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4632 }
4633 put_snap_realm(realm);
4634 }
4635
4636 if (realm_ret)
4637 *realm_ret = first_realm;
4638 else
4639 put_snap_realm(first_realm);
4640 }
4641
// Handle an MClientSnap push from an MDS: apply the snap trace to our realm
// tree, and for a SPLIT op move the listed inodes and child realms under
// the newly split-off realm.  Consumes the message.
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  // inodes to re-parent into the split realm, each with the snap context it
  // had under its old realm (used later to decide on cap-snap writeback)
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
	 p != m->split_inos.end();
	 ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// don't move an inode out of a realm newer than the split target
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	// detach from the old realm now; re-attachment happens below after
	// update_snap_trace has processed the message's snap trace
	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
	 p != m->split_realms.end();
	 ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the trace; skip pre-flushing dirty caps when realms are being
  // destroyed
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // attach the moved inodes to the split-off realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
4719
4720 void Client::handle_quota(MClientQuota *m)
4721 {
4722 mds_rank_t mds = mds_rank_t(m->get_source().num());
4723 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4724 if (!session) {
4725 m->put();
4726 return;
4727 }
4728
4729 got_mds_push(session);
4730
4731 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4732
4733 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4734 if (inode_map.count(vino)) {
4735 Inode *in = NULL;
4736 in = inode_map[vino];
4737
4738 if (in) {
4739 in->quota = m->quota;
4740 in->rstat = m->rstat;
4741 }
4742 }
4743
4744 m->put();
4745 }
4746
// Top-level dispatcher for MClientCaps messages from an MDS.  The message
// is consumed here or by the per-op handler it is forwarded to.
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload(); // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  // EXPORT and FLUSHSNAP_ACK are fully handled (and m consumed) by their
  // handlers.  IMPORT is special: handle_cap_import() runs first, then
  // control deliberately falls through to the second switch below so the
  // IMPORT is also processed as a grant on the (new) cap.
  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    handle_cap_import(session, in, m);
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();
  }
}
4820
// Handle CEPH_CAP_OP_IMPORT: this session's MDS has taken over the inode's
// caps (e.g. after subtree migration).  Adds/updates the cap for this
// session, removes the old peer cap if it matches, and re-flushes pending
// cap/snap flushes if we are now talking to the auth MDS.  Does NOT
// consume m — handle_caps falls through to the grant path afterwards.
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  // capture the perms associated with the cap we are importing from, if any
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap) {
      cap_perms = cap->latest_perms;
    }
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
		 CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the old peer cap now that it has been superseded
  if (cap && cap->cap_id == m->peer.cap_id) {
      remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4861
// Handle CEPH_CAP_OP_EXPORT: mds is migrating our cap away.  If the message
// names a peer cap, merge this cap's grants into the peer session's cap
// (creating one there if needed); otherwise, remember that the auth cap was
// dropped so later logic can recover.  Consumes m.
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  // only act if the export refers to the cap we actually hold
  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
	Cap *tcap = in->caps[peer_mds];
	// merge into the existing peer cap, but only if the peer's view is
	// not already newer than what this message describes
	if (tcap->cap_id == m->peer.cap_id &&
	    ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
	  tcap->cap_id = m->peer.cap_id;
	  tcap->seq = m->peer.seq - 1;
	  tcap->issue_seq = tcap->seq;
	  tcap->issued |= cap->issued;
	  tcap->implemented |= cap->issued;
	  if (cap == in->auth_cap)
	    in->auth_cap = tcap;
	  // migrate any in-flight flushes along with auth-ness
	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
	    adjust_session_flushing_caps(in, session, tsession);
	}
      } else {
	// no cap on the peer session yet; create one carrying our grants
	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
		       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
		       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
		       cap->latest_perms);
      }
    } else {
      // no peer: the cap is simply going away
      if (cap == in->auth_cap)
	in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
4908
4909 void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
4910 {
4911 mds_rank_t mds = session->mds_num;
4912 assert(in->caps[mds]);
4913
4914 ldout(cct, 10) << "handle_cap_trunc on ino " << *in
4915 << " size " << in->size << " -> " << m->get_size()
4916 << dendl;
4917
4918 int issued;
4919 in->caps_issued(&issued);
4920 issued |= in->caps_dirty();
4921 update_inode_file_size(in, issued, m->get_size(),
4922 m->get_truncate_seq(), m->get_truncate_size());
4923 m->put();
4924 }
4925
// Process a FLUSH_ACK from the MDS: retire every flush tid <= the acked
// tid, compute which cap bits are now fully clean, update flushing state,
// and wake waiters (wait_sync_caps et al.).  Consumes m.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;   // cap bits whose flush is now fully complete
  int flushed = 0;   // number of flush tids retired by this ack

  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // everything up to and including the acked tid is done
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // a later flush is still in flight for these bits, so they are not
    // clean yet; stop early once no acked bit remains clean
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
	  << " cleaned " << ccap_string(cleaned) << " on " << *in
	  << " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake wait_sync_caps() if this session has nothing older outstanding
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
	      << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // drop the ref held while caps were dirty/flushing
      if (!in->caps_dirty())
	put_inode(in);
    }
  }

  m->put();
}
4985
4986
4987 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
4988 {
4989 mds_rank_t mds = session->mds_num;
4990 assert(in->caps[mds]);
4991 snapid_t follows = m->get_snap_follows();
4992
4993 if (in->cap_snaps.count(follows)) {
4994 CapSnap &capsnap = in->cap_snaps.at(follows);
4995 if (m->get_client_tid() != capsnap.flush_tid) {
4996 ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
4997 } else {
4998 ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
4999 << " on " << *in << dendl;
5000 InodeRef tmp_ref;
5001 if (in->get_num_ref() == 1)
5002 tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
5003 if (in->flushing_caps == 0 && in->cap_snaps.empty())
5004 in->flushing_cap_item.remove_myself();
5005 session->flushing_caps_tids.erase(capsnap.flush_tid);
5006 in->cap_snaps.erase(follows);
5007 }
5008 } else {
5009 ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
5010 << " on " << *in << dendl;
5011 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5012 }
5013
5014 m->put();
5015 }
5016
5017 class C_Client_DentryInvalidate : public Context {
5018 private:
5019 Client *client;
5020 vinodeno_t dirino;
5021 vinodeno_t ino;
5022 string name;
5023 public:
5024 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5025 client(c), name(dn->name) {
5026 if (client->use_faked_inos()) {
5027 dirino.ino = dn->dir->parent_inode->faked_ino;
5028 if (del)
5029 ino.ino = dn->inode->faked_ino;
5030 } else {
5031 dirino = dn->dir->parent_inode->vino();
5032 if (del)
5033 ino = dn->inode->vino();
5034 }
5035 if (!del)
5036 ino.ino = inodeno_t();
5037 }
5038 void finish(int r) override {
5039 // _async_dentry_invalidate is responsible for its own locking
5040 assert(!client->client_lock.is_locked_by_me());
5041 client->_async_dentry_invalidate(dirino, ino, name);
5042 }
5043 };
5044
5045 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5046 {
5047 if (unmounting)
5048 return;
5049 ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
5050 << " in dir " << dirino << dendl;
5051 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5052 }
5053
5054 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5055 {
5056 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5057 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5058 }
5059
// Best-effort attempt to make 'in' droppable: expire its dentries, close
// its dir and snapdir, and — if sched_inval is set and the kernel still
// references it — schedule dentry invalidations and unlink its dentries.
// 'ref' tracks how many references remain as sub-objects are released.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance before unlink() may remove dn from the map
      ++p;
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the dir released one reference on the inode
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      // so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
5102
// Handle GRANT/REVOKE (and the grant half of IMPORT): apply the MDS's new
// metadata to the inode (guarded by which caps we hold exclusively), adjust
// the cap's issued/implemented bits, start writeback/releases needed to
// satisfy a revocation, and wake waiters.  Consumes m.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const int old_caps = cap->issued;
  const int new_caps = m->get_caps();
  ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
                << " mds." << mds << " seq " << m->get_seq()
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(old_caps) << dendl;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // each metadata group below is accepted only if we do not hold the
  // corresponding EXCL cap (otherwise our local copy is authoritative)
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    bufferlist::iterator p = m->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
    check = true;

  check_cap_issue(in, cap, new_caps);

  // update caps
  int revoked = old_caps & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << "  revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    // to satisfy the revoke: first write back dirty buffers if BUFFER is
    // being revoked and in use; else drop the page cache if CACHE is going;
    // otherwise just ack via check_caps
    if ((used & revoked & CEPH_CAP_FILE_BUFFER) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (revoked & CEPH_CAP_FILE_CACHE) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (old_caps == new_caps) {
    ldout(cct, 10) << "  caps unchanged at " << ccap_string(old_caps) << dendl;
  } else {
    ldout(cct, 10) << "  grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
	if (it->second == cap)
	  continue;
	if (it->second->implemented & ~it->second->issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);

  m->put();
}
5245
5246 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5247 {
5248 if (perms.uid() == 0)
5249 return 0;
5250
5251 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5252 int ret = _posix_acl_permission(in, perms, want);
5253 if (ret != -EAGAIN)
5254 return ret;
5255 }
5256
5257 // check permissions before doing anything else
5258 if (!in->check_mode(perms, want))
5259 return -EACCES;
5260 return 0;
5261 }
5262
5263 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5264 const UserPerm& perms)
5265 {
5266 int r = _getattr_for_perm(in, perms);
5267 if (r < 0)
5268 goto out;
5269
5270 r = 0;
5271 if (strncmp(name, "system.", 7) == 0) {
5272 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5273 r = -EPERM;
5274 } else {
5275 r = inode_permission(in, perms, want);
5276 }
5277 out:
5278 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5279 return r;
5280 }
5281
5282 ostream& operator<<(ostream &out, const UserPerm& perm) {
5283 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5284 return out;
5285 }
5286
// Permission check for a setattr-style operation: returns 0 if `perms`
// may apply the attribute changes described by (stx, mask) to `in`,
// otherwise a negative errno (-EPERM / -EACCES / getattr error).
// Mirrors POSIX chown/chmod/utimes ownership rules.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  // make sure we have fresh mode/owner (and ACLs, if enabled)
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // truncate requires write permission on the file itself
  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // from here on, failures are ownership violations
  r = -EPERM;
  // chown: only root, or a no-op "change" to the same uid by the owner
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  // chgrp: owner may switch to one of their own groups (or keep the gid)
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       		     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  // chmod: owner only; strip setgid when the caller isn't in the
  // (effective) owning group, like the kernel does
  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  // timestamps: non-owners may only set mtime/atime to "now", and then
  // only if they have write permission (utimes(NULL) semantics)
  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	// explicit timestamp by a non-owner: refuse
	goto out;
      } else {
	// "set to now" only needs write access
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5343
5344 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5345 {
5346 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5347 unsigned want = 0;
5348
5349 if ((flags & O_ACCMODE) == O_WRONLY)
5350 want = MAY_WRITE;
5351 else if ((flags & O_ACCMODE) == O_RDWR)
5352 want = MAY_READ | MAY_WRITE;
5353 else if ((flags & O_ACCMODE) == O_RDONLY)
5354 want = MAY_READ;
5355 if (flags & O_TRUNC)
5356 want |= MAY_WRITE;
5357
5358 int r = 0;
5359 switch (in->mode & S_IFMT) {
5360 case S_IFLNK:
5361 r = -ELOOP;
5362 goto out;
5363 case S_IFDIR:
5364 if (want & MAY_WRITE) {
5365 r = -EISDIR;
5366 goto out;
5367 }
5368 break;
5369 }
5370
5371 r = _getattr_for_perm(in, perms);
5372 if (r < 0)
5373 goto out;
5374
5375 r = inode_permission(in, perms, want);
5376 out:
5377 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5378 return r;
5379 }
5380
5381 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5382 {
5383 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5384 int r = _getattr_for_perm(dir, perms);
5385 if (r < 0)
5386 goto out;
5387
5388 r = inode_permission(dir, perms, MAY_EXEC);
5389 out:
5390 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5391 return r;
5392 }
5393
5394 int Client::may_create(Inode *dir, const UserPerm& perms)
5395 {
5396 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5397 int r = _getattr_for_perm(dir, perms);
5398 if (r < 0)
5399 goto out;
5400
5401 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5402 out:
5403 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5404 return r;
5405 }
5406
5407 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5408 {
5409 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5410 int r = _getattr_for_perm(dir, perms);
5411 if (r < 0)
5412 goto out;
5413
5414 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5415 if (r < 0)
5416 goto out;
5417
5418 /* 'name == NULL' means rmsnap */
5419 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5420 InodeRef otherin;
5421 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5422 if (r < 0)
5423 goto out;
5424 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5425 r = -EPERM;
5426 }
5427 out:
5428 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5429 return r;
5430 }
5431
5432 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5433 {
5434 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5435 int r = _getattr_for_perm(in, perms);
5436 if (r < 0)
5437 goto out;
5438
5439 if (perms.uid() == 0 || perms.uid() == in->uid) {
5440 r = 0;
5441 goto out;
5442 }
5443
5444 r = -EPERM;
5445 if (!S_ISREG(in->mode))
5446 goto out;
5447
5448 if (in->mode & S_ISUID)
5449 goto out;
5450
5451 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5452 goto out;
5453
5454 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5455 out:
5456 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5457 return r;
5458 }
5459
5460 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5461 {
5462 int mask = CEPH_STAT_CAP_MODE;
5463 bool force = false;
5464 if (acl_type != NO_ACL) {
5465 mask |= CEPH_STAT_CAP_XATTR;
5466 force = in->xattr_version == 0;
5467 }
5468 return _getattr(in, mask, perms, force);
5469 }
5470
// Return the versioned inode number (ino, snapid) for `in`.
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  return vinodeno_t(in->ino, in->snapid);
}
5476
// Return the plain inode number for `in` (no snapshot component).
inodeno_t Client::_get_inodeno(Inode *in)
{
  /* The caller must hold the client lock */
  return in->ino;
}
5482
5483
5484 /**
5485 * Resolve an MDS spec to a list of MDS daemon GIDs.
5486 *
5487 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5488 * It may be '*' in which case it matches all GIDs.
5489 *
5490 * If no error is returned, the `targets` vector will be populated with at least
5491 * one MDS.
5492 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  // requires a previously fetched FSMap (see fetch_fsmap)
  assert(fsmap);
  assert(targets != nullptr);

  // 1) try to parse the spec as a role (rank or filesystem:rank)
  mds_role_t role;
  std::stringstream ss;
  int role_r = fsmap->parse_role(mds_spec, &role, ss);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
      << role << "'" << dendl;
    targets->push_back(
        fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
    return 0;
  }

  // 2) try to parse it as a numeric GID
  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    } else {
      lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
                 << dendl;
      return -ENOENT;
    }
  } else if (mds_spec == "*") {
    // 3) It is a wildcard: use all MDSs
    const auto mds_info = fsmap->get_mds_info();

    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
      return -ENOENT;
    }

    for (const auto i : mds_info) {
      targets->push_back(i.first);
    }
  } else {
    // 4) It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;

      // dump the whole map to aid debugging the bad spec
      lderr(cct) << "FSMap: " << *fsmap << dendl;

      return -ENOENT;
    } else {
      ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
                     << "' to GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    }
  }

  return 0;
}
5555
5556
5557 /**
5558 * Authenticate with mon and establish global ID
5559 */
int Client::authenticate()
{
  assert(client_lock.is_locked_by_me());

  // already done? nothing to do
  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks and takes its own locks, so the
  // client lock must be dropped across the call and re-taken after
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  // adopt the mon-assigned global id as our client identity
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5580
// Fetch the cluster FSMap (or the trimmed-down FSMapUser when `user` is
// true) from the monitors and wait until our local copy is at least as
// new. Returns 0 on success or a negative errno from get_version.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop the client lock while blocking on the mon round-trip
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) and wait until the user-visible map catches up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full FSMap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5624
5625 /**
5626 *
5627 * @mds_spec one of ID, rank, GID, "*"
5628 *
5629 */
// Send an admin command to the MDS daemon(s) named by `mds_spec` (ID,
// rank, GID, or "*"). Output buffers/strings are filled in by
// handle_command_reply; `onfinish` fires once all targets have replied.
// Returns 0 if the command(s) were dispatched, else a negative errno.
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  Mutex::Locker lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need the FSMap to resolve specs and find daemon addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets; the gather completes `onfinish` once every
  // per-target sub-context has been completed by handle_command_reply.
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    entity_inst_t inst = info.get_inst();
    ConnectionRef conn = messenger->get_connection(inst);

    // Generate MDSCommandOp state; the op is tracked in command_table
    // until its reply arrives
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
      << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    MCommand *m = op.get_message(monclient->get_fsid());
    conn->send_message(m);
  }
  gather.activate();

  return 0;
}
5712
5713 void Client::handle_command_reply(MCommandReply *m)
5714 {
5715 ceph_tid_t const tid = m->get_tid();
5716
5717 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5718
5719 if (!command_table.exists(tid)) {
5720 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5721 m->put();
5722 return;
5723 }
5724
5725 auto &op = command_table.get_command(tid);
5726 if (op.outbl) {
5727 op.outbl->claim(m->get_data());
5728 }
5729 if (op.outs) {
5730 *op.outs = m->rs;
5731 }
5732
5733 if (op.on_finish) {
5734 op.on_finish->complete(m->r);
5735 }
5736
5737 command_table.erase(tid);
5738
5739 m->put();
5740 }
5741
5742 // -------------------
5743 // MOUNT
5744
// Mount the filesystem at `mount_root` ("" means "/"). Authenticates,
// subscribes to the MDS map (optionally a specific namespace's map),
// optionally waits for an available MDS cluster, then walks up from the
// mount point issuing GETATTRs so quota/parent info is populated.
// Returns 0, a negative errno, or CEPH_FUSE_NO_MDS_UP.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    // a specific filesystem was requested: resolve its cluster id via the
    // user FSMap and subscribe to that filesystem's mdsmap ("mdsmap.<cid>")
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    // block until the MDS cluster can actually serve us (or error out)
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // Walk from the mount point up to the root, doing a GETATTR at each
  // level; this primes the cache (and quota info) for ancestors.
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      // EACCES on an ancestor is tolerable once the root itself resolved
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  _ll_get(root);	// pin the root for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5857
5858 // UNMOUNT
5859
5860 void Client::_close_sessions()
5861 {
5862 while (!mds_sessions.empty()) {
5863 // send session closes!
5864 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5865 p != mds_sessions.end();
5866 ++p) {
5867 if (p->second->state != MetaSession::STATE_CLOSING) {
5868 _close_mds_session(p->second);
5869 }
5870 }
5871
5872 // wait for sessions to close
5873 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5874 mount_cond.Wait(client_lock);
5875 }
5876 }
5877
5878 void Client::flush_mdlog_sync()
5879 {
5880 if (mds_requests.empty())
5881 return;
5882 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5883 p != mds_sessions.end();
5884 ++p) {
5885 MetaSession *s = p->second;
5886 flush_mdlog(s);
5887 }
5888 }
5889
5890 void Client::flush_mdlog(MetaSession *session)
5891 {
5892 // Only send this to Luminous or newer MDS daemons, older daemons
5893 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5894 const uint64_t features = session->con->get_features();
5895 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5896 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5897 session->con->send_message(m);
5898 }
5899 }
5900
5901
// Tear down the mount: drain in-flight MDS requests, close every open
// file/dir handle, flush dirty data and caps (unless blacklisted), wait
// for the cache to empty, then close all MDS sessions. Caller holds
// client_lock; idempotent once `unmounting` is set.
void Client::_unmount()
{
  if (unmounting)
    return;

  ldout(cct, 2) << "unmounting" << dendl;
  unmounting = true;

  deleg_timeout = 0;

  flush_mdlog_sync(); // flush the mdlog for pending requests, if any
  // wait for every outstanding MDS request to complete
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // stop the periodic tick
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ... and any unclosed low-level (ll_) handles
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // ... and any unclosed directory handles
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (blacklisted) {
    // we cannot talk to the cluster anymore; skip flushing and bail out
    ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;

    if (cct->_conf->client_oc) {
      // Purge all cached data so that ObjectCacher doesn't get hung up
      // trying to flush it.  ObjectCacher's behaviour on EBLACKLISTED
      // is to just leave things marked dirty
      // (http://tracker.ceph.com/issues/9105)
      for (const auto &i : inode_map) {
	objectcacher->purge_set(&(i.second->oset));
      }
    }

    mounted = false;
    return;
  }

  // wait for un-acked sync writes to be committed by the OSDs
  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    // (iterate with an explicit `next` because _release/_flush may
    //  drop the inode and invalidate the current iterator)
    ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
    for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
	 p != inode_map.end();
	 p = next) {
      next = p;
      ++next;
      Inode *in = p->second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
	assert(in);
      }
      if (!in->caps.empty()) {
	InodeRef tmp_ref(in);
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  flush_caps_sync();
  wait_sync_caps(last_flush_tid);

  // empty lru cache
  trim_cache();

  // wait for the cache to fully drain (caps being released by the MDS);
  // dump the cache for debugging if we stall for 5 seconds
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
		  << "+" << inode_map.size() << " items"
		  << ", waiting (for caps to release?)"
		  << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  assert(lru.lru_get_size() == 0);
  assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
6024
// Public unmount entry point: takes the client lock and delegates to
// _unmount() for the actual teardown.
void Client::unmount()
{
  Mutex::Locker lock(client_lock);
  _unmount();
}
6030
6031 void Client::flush_cap_releases()
6032 {
6033 // send any cap releases
6034 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6035 p != mds_sessions.end();
6036 ++p) {
6037 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
6038 p->first)) {
6039 if (cct->_conf->client_inject_release_failure) {
6040 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6041 p->second->release->put();
6042 } else {
6043 p->second->con->send_message(p->second->release);
6044 }
6045 p->second->release = 0;
6046 }
6047 }
6048 }
6049
// Periodic housekeeping, re-armed via the timer on every call: abort
// timed-out pre-mount requests, renew caps, flush cap releases, check
// delayed caps, and trim the cache. Runs under client_lock (Timer holds
// it when calling back).
void Client::tick()
{
  // debug hook: artificially delay the tick, then reset the knob
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  // re-arm ourselves for the next interval
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	assert(client_lock.is_locked_by_me());
	tick();
      }));
  utime_t now = ceph_clock_now();

  // before mount completes: time out the oldest pending request and wake
  // everyone waiting on it / on the mdsmap / on session opens
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: re-check inodes whose hold period has expired
  // (the list is ordered; stop at the first not-yet-due entry)
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6106
6107 void Client::renew_caps()
6108 {
6109 ldout(cct, 10) << "renew_caps()" << dendl;
6110 last_cap_renew = ceph_clock_now();
6111
6112 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6113 p != mds_sessions.end();
6114 ++p) {
6115 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6116 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6117 renew_caps(p->second);
6118 }
6119 }
6120
6121 void Client::renew_caps(MetaSession *session)
6122 {
6123 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6124 session->last_cap_renew_request = ceph_clock_now();
6125 uint64_t seq = ++session->cap_renew_seq;
6126 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6127 }
6128
6129
6130 // ===============================================================
6131 // high level (POSIXy) interface
6132
6133 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6134 InodeRef *target, const UserPerm& perms)
6135 {
6136 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6137 MetaRequest *req = new MetaRequest(op);
6138 filepath path;
6139 dir->make_nosnap_relative_path(path);
6140 path.push_dentry(name);
6141 req->set_filepath(path);
6142 req->set_inode(dir);
6143 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6144 mask |= DEBUG_GETATTR_CAPS;
6145 req->head.args.getattr.mask = mask;
6146
6147 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6148
6149 int r = make_request(req, perms, target);
6150 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6151 return r;
6152 }
6153
// Look up `dname` in `dir`, preferring cached dentries when a dentry
// lease or the directory's FILE_SHARED cap makes them trustworthy, and
// falling back to an MDS round-trip (_do_lookup) otherwise. `mask` is
// the cap mask the caller needs on the result. Fills *target; returns 0
// or a negative errno (-ENOTDIR, -ENAMETOOLONG, -ENOENT, ...).
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  // ".." resolves locally via the parent pointer
  if (dname == "..") {
    if (dir->dn_set.empty())
      *target = dir;
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // magic snapshot directory (e.g. ".snap") on a live inode
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    // only trust the cached dentry if its inode carries the caps the
    // caller asked for (or it is a null dentry)
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	// the lease is only valid while the issuing session's cap state
	// is itself current (same generation, unexpired)
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?  (FILE_SHARED on the directory covers its dentries)
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	// a null dentry in a complete directory proves non-existence
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss / untrusted cache: ask the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6255
6256 int Client::get_or_create(Inode *dir, const char* name,
6257 Dentry **pdn, bool expect_null)
6258 {
6259 // lookup
6260 ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
6261 dir->open_dir();
6262 if (dir->dir->dentries.count(name)) {
6263 Dentry *dn = dir->dir->dentries[name];
6264
6265 // is dn lease valid?
6266 utime_t now = ceph_clock_now();
6267 if (dn->inode &&
6268 dn->lease_mds >= 0 &&
6269 dn->lease_ttl > now &&
6270 mds_sessions.count(dn->lease_mds)) {
6271 MetaSession *s = mds_sessions[dn->lease_mds];
6272 if (s->cap_ttl > now &&
6273 s->cap_gen == dn->lease_gen) {
6274 if (expect_null)
6275 return -EEXIST;
6276 }
6277 }
6278 *pdn = dn;
6279 } else {
6280 // otherwise link up a new one
6281 *pdn = link(dir->dir, name, NULL, NULL);
6282 }
6283
6284 // success
6285 return 0;
6286 }
6287
// Resolve `origpath` component by component from root (absolute) or cwd
// (relative), following directory symlinks always and the trailing
// symlink only when `followsym`. `mask` is extra caps wanted on the
// final component. Fills *end on success; returns 0 or a negative errno
// (-ENOENT, -ELOOP after MAXSYMLINKS expansions, permission errors).
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;	// total symlink expansions, bounded by MAXSYMLINKS

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // need search permission on each intermediate directory
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  // absolute target: restart the walk from the root
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6369
6370
6371 // namespace ops
6372
6373 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6374 {
6375 Mutex::Locker lock(client_lock);
6376 tout(cct) << "link" << std::endl;
6377 tout(cct) << relexisting << std::endl;
6378 tout(cct) << relpath << std::endl;
6379
6380 if (unmounting)
6381 return -ENOTCONN;
6382
6383 filepath existing(relexisting);
6384
6385 InodeRef in, dir;
6386 int r = path_walk(existing, &in, perm, true);
6387 if (r < 0)
6388 return r;
6389 if (std::string(relpath) == "/") {
6390 r = -EEXIST;
6391 return r;
6392 }
6393 filepath path(relpath);
6394 string name = path.last_dentry();
6395 path.pop_dentry();
6396
6397 r = path_walk(path, &dir, perm, true);
6398 if (r < 0)
6399 return r;
6400 if (cct->_conf->client_permissions) {
6401 if (S_ISDIR(in->mode)) {
6402 r = -EPERM;
6403 return r;
6404 }
6405 r = may_hardlink(in.get(), perm);
6406 if (r < 0)
6407 return r;
6408 r = may_create(dir.get(), perm);
6409 if (r < 0)
6410 return r;
6411 }
6412 r = _link(in.get(), dir.get(), name.c_str(), perm);
6413 return r;
6414 }
6415
6416 int Client::unlink(const char *relpath, const UserPerm& perm)
6417 {
6418 Mutex::Locker lock(client_lock);
6419 tout(cct) << "unlink" << std::endl;
6420 tout(cct) << relpath << std::endl;
6421
6422 if (unmounting)
6423 return -ENOTCONN;
6424
6425 if (std::string(relpath) == "/")
6426 return -EISDIR;
6427
6428 filepath path(relpath);
6429 string name = path.last_dentry();
6430 path.pop_dentry();
6431 InodeRef dir;
6432 int r = path_walk(path, &dir, perm);
6433 if (r < 0)
6434 return r;
6435 if (cct->_conf->client_permissions) {
6436 r = may_delete(dir.get(), name.c_str(), perm);
6437 if (r < 0)
6438 return r;
6439 }
6440 return _unlink(dir.get(), name.c_str(), perm);
6441 }
6442
6443 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6444 {
6445 Mutex::Locker lock(client_lock);
6446 tout(cct) << "rename" << std::endl;
6447 tout(cct) << relfrom << std::endl;
6448 tout(cct) << relto << std::endl;
6449
6450 if (unmounting)
6451 return -ENOTCONN;
6452
6453 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6454 return -EBUSY;
6455
6456 filepath from(relfrom);
6457 filepath to(relto);
6458 string fromname = from.last_dentry();
6459 from.pop_dentry();
6460 string toname = to.last_dentry();
6461 to.pop_dentry();
6462
6463 InodeRef fromdir, todir;
6464 int r = path_walk(from, &fromdir, perm);
6465 if (r < 0)
6466 goto out;
6467 r = path_walk(to, &todir, perm);
6468 if (r < 0)
6469 goto out;
6470
6471 if (cct->_conf->client_permissions) {
6472 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6473 if (r < 0)
6474 return r;
6475 r = may_delete(todir.get(), toname.c_str(), perm);
6476 if (r < 0 && r != -ENOENT)
6477 return r;
6478 }
6479 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6480 out:
6481 return r;
6482 }
6483
6484 // dirs
6485
6486 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6487 {
6488 Mutex::Locker lock(client_lock);
6489 tout(cct) << "mkdir" << std::endl;
6490 tout(cct) << relpath << std::endl;
6491 tout(cct) << mode << std::endl;
6492 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6493
6494 if (unmounting)
6495 return -ENOTCONN;
6496
6497 if (std::string(relpath) == "/")
6498 return -EEXIST;
6499
6500 filepath path(relpath);
6501 string name = path.last_dentry();
6502 path.pop_dentry();
6503 InodeRef dir;
6504 int r = path_walk(path, &dir, perm);
6505 if (r < 0)
6506 return r;
6507 if (cct->_conf->client_permissions) {
6508 r = may_create(dir.get(), perm);
6509 if (r < 0)
6510 return r;
6511 }
6512 return _mkdir(dir.get(), name.c_str(), mode, perm);
6513 }
6514
/**
 * Create every missing directory along 'relpath' (like `mkdir -p`).
 *
 * Phase 1 walks the already-existing prefix of the path; phase 2 creates
 * one directory per remaining component.
 *
 * @return 0 on success; -EEXIST if the entire path already exists;
 *         negative errno from lookup/permission/create otherwise.
 */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      // must be allowed to create children in the current parent
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if(-EEXIST == r && i < path.depth() - 1) {
      // a racing creator made this intermediate dir first; just look it up
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6571
6572 int Client::rmdir(const char *relpath, const UserPerm& perms)
6573 {
6574 Mutex::Locker lock(client_lock);
6575 tout(cct) << "rmdir" << std::endl;
6576 tout(cct) << relpath << std::endl;
6577
6578 if (unmounting)
6579 return -ENOTCONN;
6580
6581 if (std::string(relpath) == "/")
6582 return -EBUSY;
6583
6584 filepath path(relpath);
6585 string name = path.last_dentry();
6586 path.pop_dentry();
6587 InodeRef dir;
6588 int r = path_walk(path, &dir, perms);
6589 if (r < 0)
6590 return r;
6591 if (cct->_conf->client_permissions) {
6592 int r = may_delete(dir.get(), name.c_str(), perms);
6593 if (r < 0)
6594 return r;
6595 }
6596 return _rmdir(dir.get(), name.c_str(), perms);
6597 }
6598
6599 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6600 {
6601 Mutex::Locker lock(client_lock);
6602 tout(cct) << "mknod" << std::endl;
6603 tout(cct) << relpath << std::endl;
6604 tout(cct) << mode << std::endl;
6605 tout(cct) << rdev << std::endl;
6606
6607 if (unmounting)
6608 return -ENOTCONN;
6609
6610 if (std::string(relpath) == "/")
6611 return -EEXIST;
6612
6613 filepath path(relpath);
6614 string name = path.last_dentry();
6615 path.pop_dentry();
6616 InodeRef dir;
6617 int r = path_walk(path, &dir, perms);
6618 if (r < 0)
6619 return r;
6620 if (cct->_conf->client_permissions) {
6621 int r = may_create(dir.get(), perms);
6622 if (r < 0)
6623 return r;
6624 }
6625 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6626 }
6627
6628 // symlinks
6629
6630 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6631 {
6632 Mutex::Locker lock(client_lock);
6633 tout(cct) << "symlink" << std::endl;
6634 tout(cct) << target << std::endl;
6635 tout(cct) << relpath << std::endl;
6636
6637 if (unmounting)
6638 return -ENOTCONN;
6639
6640 if (std::string(relpath) == "/")
6641 return -EEXIST;
6642
6643 filepath path(relpath);
6644 string name = path.last_dentry();
6645 path.pop_dentry();
6646 InodeRef dir;
6647 int r = path_walk(path, &dir, perms);
6648 if (r < 0)
6649 return r;
6650 if (cct->_conf->client_permissions) {
6651 int r = may_create(dir.get(), perms);
6652 if (r < 0)
6653 return r;
6654 }
6655 return _symlink(dir.get(), name.c_str(), target, perms);
6656 }
6657
6658 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6659 {
6660 Mutex::Locker lock(client_lock);
6661 tout(cct) << "readlink" << std::endl;
6662 tout(cct) << relpath << std::endl;
6663
6664 if (unmounting)
6665 return -ENOTCONN;
6666
6667 filepath path(relpath);
6668 InodeRef in;
6669 int r = path_walk(path, &in, perms, false);
6670 if (r < 0)
6671 return r;
6672
6673 return _readlink(in.get(), buf, size);
6674 }
6675
6676 int Client::_readlink(Inode *in, char *buf, size_t size)
6677 {
6678 if (!in->is_symlink())
6679 return -EINVAL;
6680
6681 // copy into buf (at most size bytes)
6682 int r = in->symlink.length();
6683 if (r > (int)size)
6684 r = size;
6685 memcpy(buf, in->symlink.c_str(), r);
6686 return r;
6687 }
6688
6689
6690 // inode stuff
6691
6692 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6693 {
6694 bool yes = in->caps_issued_mask(mask, true);
6695
6696 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6697 if (yes && !force)
6698 return 0;
6699
6700 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6701 filepath path;
6702 in->make_nosnap_relative_path(path);
6703 req->set_filepath(path);
6704 req->set_inode(in);
6705 req->head.args.getattr.mask = mask;
6706
6707 int res = make_request(req, perms);
6708 ldout(cct, 10) << "_getattr result=" << res << dendl;
6709 return res;
6710 }
6711
/**
 * Apply attribute changes (CEPH_SETATTR_* bits in 'mask') to an inode.
 *
 * Changes covered by exclusive caps we hold (Ax for owner/mode/btime,
 * Fx for atime/mtime) are applied locally and the caps marked dirty;
 * any bits left over are sent synchronously to the MDS as a SETATTR.
 *
 * @param in inode to modify
 * @param stx attribute values to apply (fields selected by 'mask')
 * @param mask CEPH_SETATTR_* bits selecting which stx fields to use;
 *        mask==0 means "just bump the ctime"
 * @param perms caller credentials
 * @param inp optional out: inode ref from the MDS reply
 * @return 0 on success; -EROFS for snapshot inodes, -EDQUOT on quota,
 *         -EFBIG for an oversized truncate, or the MDS request result
 */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must not blow the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold; otherwise go to the MDS
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  // with the AUTH exclusive cap, owner/mode/btime changes can be local
  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  // with the FILE exclusive cap, time changes can be local too
  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  // everything handled locally; done
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  // whatever remains in 'mask' goes to the MDS synchronously
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6902
6903 /* Note that we only care about attrs that setattr cares about */
6904 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6905 {
6906 stx->stx_size = st->st_size;
6907 stx->stx_mode = st->st_mode;
6908 stx->stx_uid = st->st_uid;
6909 stx->stx_gid = st->st_gid;
6910 stx->stx_mtime = st->st_mtim;
6911 stx->stx_atime = st->st_atim;
6912 }
6913
6914 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6915 const UserPerm& perms, InodeRef *inp)
6916 {
6917 int ret = _do_setattr(in, stx, mask, perms, inp);
6918 if (ret < 0)
6919 return ret;
6920 if (mask & CEPH_SETATTR_MODE)
6921 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6922 return ret;
6923 }
6924
6925 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6926 const UserPerm& perms)
6927 {
6928 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6929 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6930 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6931 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6932 if (cct->_conf->client_permissions) {
6933 int r = may_setattr(in.get(), stx, mask, perms);
6934 if (r < 0)
6935 return r;
6936 }
6937 return __setattrx(in.get(), stx, mask, perms);
6938 }
6939
6940 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6941 const UserPerm& perms)
6942 {
6943 struct ceph_statx stx;
6944
6945 stat_to_statx(attr, &stx);
6946 mask &= ~CEPH_SETATTR_BTIME;
6947
6948 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6949 mask &= ~CEPH_SETATTR_UID;
6950 }
6951 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6952 mask &= ~CEPH_SETATTR_GID;
6953 }
6954
6955 return _setattrx(in, &stx, mask, perms);
6956 }
6957
6958 int Client::setattr(const char *relpath, struct stat *attr, int mask,
6959 const UserPerm& perms)
6960 {
6961 Mutex::Locker lock(client_lock);
6962 tout(cct) << "setattr" << std::endl;
6963 tout(cct) << relpath << std::endl;
6964 tout(cct) << mask << std::endl;
6965
6966 if (unmounting)
6967 return -ENOTCONN;
6968
6969 filepath path(relpath);
6970 InodeRef in;
6971 int r = path_walk(path, &in, perms);
6972 if (r < 0)
6973 return r;
6974 return _setattr(in, attr, mask, perms);
6975 }
6976
6977 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6978 const UserPerm& perms, int flags)
6979 {
6980 Mutex::Locker lock(client_lock);
6981 tout(cct) << "setattrx" << std::endl;
6982 tout(cct) << relpath << std::endl;
6983 tout(cct) << mask << std::endl;
6984
6985 if (unmounting)
6986 return -ENOTCONN;
6987
6988 filepath path(relpath);
6989 InodeRef in;
6990 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6991 if (r < 0)
6992 return r;
6993 return _setattrx(in, stx, mask, perms);
6994 }
6995
6996 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6997 {
6998 Mutex::Locker lock(client_lock);
6999 tout(cct) << "fsetattr" << std::endl;
7000 tout(cct) << fd << std::endl;
7001 tout(cct) << mask << std::endl;
7002
7003 if (unmounting)
7004 return -ENOTCONN;
7005
7006 Fh *f = get_filehandle(fd);
7007 if (!f)
7008 return -EBADF;
7009 #if defined(__linux__) && defined(O_PATH)
7010 if (f->flags & O_PATH)
7011 return -EBADF;
7012 #endif
7013 return _setattr(f->inode, attr, mask, perms);
7014 }
7015
7016 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7017 {
7018 Mutex::Locker lock(client_lock);
7019 tout(cct) << "fsetattr" << std::endl;
7020 tout(cct) << fd << std::endl;
7021 tout(cct) << mask << std::endl;
7022
7023 if (unmounting)
7024 return -ENOTCONN;
7025
7026 Fh *f = get_filehandle(fd);
7027 if (!f)
7028 return -EBADF;
7029 #if defined(__linux__) && defined(O_PATH)
7030 if (f->flags & O_PATH)
7031 return -EBADF;
7032 #endif
7033 return _setattrx(f->inode, stx, mask, perms);
7034 }
7035
7036 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7037 frag_info_t *dirstat, int mask)
7038 {
7039 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7040 Mutex::Locker lock(client_lock);
7041 tout(cct) << "stat" << std::endl;
7042 tout(cct) << relpath << std::endl;
7043
7044 if (unmounting)
7045 return -ENOTCONN;
7046
7047 filepath path(relpath);
7048 InodeRef in;
7049 int r = path_walk(path, &in, perms, true, mask);
7050 if (r < 0)
7051 return r;
7052 r = _getattr(in, mask, perms);
7053 if (r < 0) {
7054 ldout(cct, 3) << "stat exit on error!" << dendl;
7055 return r;
7056 }
7057 fill_stat(in, stbuf, dirstat);
7058 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7059 return r;
7060 }
7061
7062 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7063 {
7064 unsigned mask = 0;
7065
7066 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7067 if (flags & AT_NO_ATTR_SYNC)
7068 goto out;
7069
7070 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7071 mask |= CEPH_CAP_PIN;
7072 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7073 mask |= CEPH_CAP_AUTH_SHARED;
7074 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7075 mask |= CEPH_CAP_LINK_SHARED;
7076 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7077 mask |= CEPH_CAP_FILE_SHARED;
7078 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7079 mask |= CEPH_CAP_XATTR_SHARED;
7080 out:
7081 return mask;
7082 }
7083
7084 int Client::statx(const char *relpath, struct ceph_statx *stx,
7085 const UserPerm& perms,
7086 unsigned int want, unsigned int flags)
7087 {
7088 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
7089 Mutex::Locker lock(client_lock);
7090 tout(cct) << "statx" << std::endl;
7091 tout(cct) << relpath << std::endl;
7092
7093 if (unmounting)
7094 return -ENOTCONN;
7095
7096 filepath path(relpath);
7097 InodeRef in;
7098
7099 unsigned mask = statx_to_mask(flags, want);
7100
7101 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7102 if (r < 0)
7103 return r;
7104
7105 r = _getattr(in, mask, perms);
7106 if (r < 0) {
7107 ldout(cct, 3) << "statx exit on error!" << dendl;
7108 return r;
7109 }
7110
7111 fill_statx(in, mask, stx);
7112 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7113 return r;
7114 }
7115
7116 int Client::lstat(const char *relpath, struct stat *stbuf,
7117 const UserPerm& perms, frag_info_t *dirstat, int mask)
7118 {
7119 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7120 Mutex::Locker lock(client_lock);
7121 tout(cct) << "lstat" << std::endl;
7122 tout(cct) << relpath << std::endl;
7123
7124 if (unmounting)
7125 return -ENOTCONN;
7126
7127 filepath path(relpath);
7128 InodeRef in;
7129 // don't follow symlinks
7130 int r = path_walk(path, &in, perms, false, mask);
7131 if (r < 0)
7132 return r;
7133 r = _getattr(in, mask, perms);
7134 if (r < 0) {
7135 ldout(cct, 3) << "lstat exit on error!" << dendl;
7136 return r;
7137 }
7138 fill_stat(in, stbuf, dirstat);
7139 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7140 return r;
7141 }
7142
/**
 * Populate a POSIX struct stat from our cached inode state, optionally
 * copying out the directory fragstat/rstat.
 *
 * @return the caps currently issued on the inode (callers can use this
 *         to judge how authoritative the returned attributes are)
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // the snapshot id doubles as the device number
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // synthesize nlink for directories from the subdir count
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report whichever of ctime/mtime is later as the POSIX ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // directory "size": recursive bytes or entry count, per config
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;
  }
  st->st_blksize = MAX(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7204
/**
 * Populate a ceph_statx from cached inode state.  'mask' is the set of
 * cap bits we hold/refreshed; each statx field group is filled (and its
 * CEPH_STATX_* bit set in stx_mask) only when the corresponding caps
 * are present.
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  // ownership / mode / birth time need the AUTH shared cap
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  // link count needs the LINK shared cap
  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      // synthesize nlink for directories from the subdir count
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  // times and size need the FILE shared cap
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // directory "size": recursive bytes or entry count, per config
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // report whichever of ctime/mtime is later as the ctime
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7287
// Mark the dentry as recently used so the LRU keeps it cached longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7292
7293 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7294 {
7295 Mutex::Locker lock(client_lock);
7296 tout(cct) << "chmod" << std::endl;
7297 tout(cct) << relpath << std::endl;
7298 tout(cct) << mode << std::endl;
7299
7300 if (unmounting)
7301 return -ENOTCONN;
7302
7303 filepath path(relpath);
7304 InodeRef in;
7305 int r = path_walk(path, &in, perms);
7306 if (r < 0)
7307 return r;
7308 struct stat attr;
7309 attr.st_mode = mode;
7310 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7311 }
7312
7313 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7314 {
7315 Mutex::Locker lock(client_lock);
7316 tout(cct) << "fchmod" << std::endl;
7317 tout(cct) << fd << std::endl;
7318 tout(cct) << mode << std::endl;
7319
7320 if (unmounting)
7321 return -ENOTCONN;
7322
7323 Fh *f = get_filehandle(fd);
7324 if (!f)
7325 return -EBADF;
7326 #if defined(__linux__) && defined(O_PATH)
7327 if (f->flags & O_PATH)
7328 return -EBADF;
7329 #endif
7330 struct stat attr;
7331 attr.st_mode = mode;
7332 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7333 }
7334
7335 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7336 {
7337 Mutex::Locker lock(client_lock);
7338 tout(cct) << "lchmod" << std::endl;
7339 tout(cct) << relpath << std::endl;
7340 tout(cct) << mode << std::endl;
7341
7342 if (unmounting)
7343 return -ENOTCONN;
7344
7345 filepath path(relpath);
7346 InodeRef in;
7347 // don't follow symlinks
7348 int r = path_walk(path, &in, perms, false);
7349 if (r < 0)
7350 return r;
7351 struct stat attr;
7352 attr.st_mode = mode;
7353 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7354 }
7355
7356 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7357 const UserPerm& perms)
7358 {
7359 Mutex::Locker lock(client_lock);
7360 tout(cct) << "chown" << std::endl;
7361 tout(cct) << relpath << std::endl;
7362 tout(cct) << new_uid << std::endl;
7363 tout(cct) << new_gid << std::endl;
7364
7365 if (unmounting)
7366 return -ENOTCONN;
7367
7368 filepath path(relpath);
7369 InodeRef in;
7370 int r = path_walk(path, &in, perms);
7371 if (r < 0)
7372 return r;
7373 struct stat attr;
7374 attr.st_uid = new_uid;
7375 attr.st_gid = new_gid;
7376 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7377 }
7378
7379 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7380 {
7381 Mutex::Locker lock(client_lock);
7382 tout(cct) << "fchown" << std::endl;
7383 tout(cct) << fd << std::endl;
7384 tout(cct) << new_uid << std::endl;
7385 tout(cct) << new_gid << std::endl;
7386
7387 if (unmounting)
7388 return -ENOTCONN;
7389
7390 Fh *f = get_filehandle(fd);
7391 if (!f)
7392 return -EBADF;
7393 #if defined(__linux__) && defined(O_PATH)
7394 if (f->flags & O_PATH)
7395 return -EBADF;
7396 #endif
7397 struct stat attr;
7398 attr.st_uid = new_uid;
7399 attr.st_gid = new_gid;
7400 int mask = 0;
7401 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7402 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7403 return _setattr(f->inode, &attr, mask, perms);
7404 }
7405
7406 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7407 const UserPerm& perms)
7408 {
7409 Mutex::Locker lock(client_lock);
7410 tout(cct) << "lchown" << std::endl;
7411 tout(cct) << relpath << std::endl;
7412 tout(cct) << new_uid << std::endl;
7413 tout(cct) << new_gid << std::endl;
7414
7415 if (unmounting)
7416 return -ENOTCONN;
7417
7418 filepath path(relpath);
7419 InodeRef in;
7420 // don't follow symlinks
7421 int r = path_walk(path, &in, perms, false);
7422 if (r < 0)
7423 return r;
7424 struct stat attr;
7425 attr.st_uid = new_uid;
7426 attr.st_gid = new_gid;
7427 int mask = 0;
7428 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7429 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7430 return _setattr(in, &attr, mask, perms);
7431 }
7432
7433 int Client::utime(const char *relpath, struct utimbuf *buf,
7434 const UserPerm& perms)
7435 {
7436 Mutex::Locker lock(client_lock);
7437 tout(cct) << "utime" << std::endl;
7438 tout(cct) << relpath << std::endl;
7439 tout(cct) << buf->modtime << std::endl;
7440 tout(cct) << buf->actime << std::endl;
7441
7442 if (unmounting)
7443 return -ENOTCONN;
7444
7445 filepath path(relpath);
7446 InodeRef in;
7447 int r = path_walk(path, &in, perms);
7448 if (r < 0)
7449 return r;
7450 struct stat attr;
7451 stat_set_mtime_sec(&attr, buf->modtime);
7452 stat_set_mtime_nsec(&attr, 0);
7453 stat_set_atime_sec(&attr, buf->actime);
7454 stat_set_atime_nsec(&attr, 0);
7455 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7456 }
7457
7458 int Client::lutime(const char *relpath, struct utimbuf *buf,
7459 const UserPerm& perms)
7460 {
7461 Mutex::Locker lock(client_lock);
7462 tout(cct) << "lutime" << std::endl;
7463 tout(cct) << relpath << std::endl;
7464 tout(cct) << buf->modtime << std::endl;
7465 tout(cct) << buf->actime << std::endl;
7466
7467 if (unmounting)
7468 return -ENOTCONN;
7469
7470 filepath path(relpath);
7471 InodeRef in;
7472 // don't follow symlinks
7473 int r = path_walk(path, &in, perms, false);
7474 if (r < 0)
7475 return r;
7476 struct stat attr;
7477 stat_set_mtime_sec(&attr, buf->modtime);
7478 stat_set_mtime_nsec(&attr, 0);
7479 stat_set_atime_sec(&attr, buf->actime);
7480 stat_set_atime_nsec(&attr, 0);
7481 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7482 }
7483
7484 int Client::flock(int fd, int operation, uint64_t owner)
7485 {
7486 Mutex::Locker lock(client_lock);
7487 tout(cct) << "flock" << std::endl;
7488 tout(cct) << fd << std::endl;
7489 tout(cct) << operation << std::endl;
7490 tout(cct) << owner << std::endl;
7491
7492 if (unmounting)
7493 return -ENOTCONN;
7494
7495 Fh *f = get_filehandle(fd);
7496 if (!f)
7497 return -EBADF;
7498
7499 return _flock(f, operation, owner);
7500 }
7501
7502 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7503 {
7504 Mutex::Locker lock(client_lock);
7505 tout(cct) << "opendir" << std::endl;
7506 tout(cct) << relpath << std::endl;
7507
7508 if (unmounting)
7509 return -ENOTCONN;
7510
7511 filepath path(relpath);
7512 InodeRef in;
7513 int r = path_walk(path, &in, perms, true);
7514 if (r < 0)
7515 return r;
7516 if (cct->_conf->client_permissions) {
7517 int r = may_open(in.get(), O_RDONLY, perms);
7518 if (r < 0)
7519 return r;
7520 }
7521 r = _opendir(in.get(), dirpp, perms);
7522 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7523 if (r != -ENOTDIR)
7524 tout(cct) << (unsigned long)*dirpp << std::endl;
7525 return r;
7526 }
7527
7528 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7529 {
7530 if (!in->is_dir())
7531 return -ENOTDIR;
7532 *dirpp = new dir_result_t(in, perms);
7533 opened_dirs.insert(*dirpp);
7534 ldout(cct, 8) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7535 return 0;
7536 }
7537
7538
7539 int Client::closedir(dir_result_t *dir)
7540 {
7541 Mutex::Locker lock(client_lock);
7542 tout(cct) << "closedir" << std::endl;
7543 tout(cct) << (unsigned long)dir << std::endl;
7544
7545 ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
7546 _closedir(dir);
7547 return 0;
7548 }
7549
7550 void Client::_closedir(dir_result_t *dirp)
7551 {
7552 ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
7553 if (dirp->inode) {
7554 ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
7555 dirp->inode.reset();
7556 }
7557 _readdir_drop_dirp_buffer(dirp);
7558 opened_dirs.erase(dirp);
7559 delete dirp;
7560 }
7561
7562 void Client::rewinddir(dir_result_t *dirp)
7563 {
7564 Mutex::Locker lock(client_lock);
7565 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
7566
7567 if (unmounting)
7568 return;
7569
7570 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7571 _readdir_drop_dirp_buffer(d);
7572 d->reset();
7573 }
7574
7575 loff_t Client::telldir(dir_result_t *dirp)
7576 {
7577 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7578 ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
7579 return d->offset;
7580 }
7581
// Reposition a directory handle to `offset` (a value previously
// returned by telldir()).  Decides whether the currently buffered frag
// can still serve the new position or must be dropped and re-fetched.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  // no-op if we're already there
  if (offset == dirp->offset)
    return;

  // any seek invalidates the assumptions behind the readdir cache:
  // a forward seek means we skipped entries (can't release cleanly), a
  // backward seek means we'll revisit entries out of order
  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash-ordered listing: only a backward seek forces a refetch
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag-ordered listing: refetch when rewinding to the start, when
    // the target lands in a different frag, or when it lies before the
    // beginning of the currently buffered chunk
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7615
7616
7617 //struct dirent {
7618 // ino_t d_ino; /* inode number */
7619 // off_t d_off; /* offset to the next dirent */
7620 // unsigned short d_reclen; /* length of this record */
7621 // unsigned char d_type; /* type of file */
7622 // char d_name[256]; /* filename */
7623 //};
// Populate a struct dirent from name/type/ino, with `next_off` recorded
// as d_off (the offset of the entry *after* this one) on platforms that
// have that field.  Names longer than 255 bytes are silently truncated.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // copy at most 255 chars and always NUL-terminate (strncpy does not
  // terminate on truncation by itself)
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  // d_off only exists on Linux-like dirent layouts
  de->d_off = next_off;
#endif
  // d_reclen is not a real record length here; callers that need one
  // (e.g. _getdents) use sizeof(dirent) instead
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7639
// Advance the iteration state to the next directory fragment, or mark
// the listing complete if the current frag was the rightmost one.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    // no frags remain: end of directory
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    // offset 2 skips the synthetic "." and ".." slots
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // frag-ordered: restart name continuation and re-map the frag
    // against our (possibly newer) dirfragtree
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7665
// Re-map the frag implied by the current offset through the inode's
// dirfragtree; if the tree has changed (frag split/merge) reposition the
// iterator at the start of the frag that now covers that range.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  // hash-ordered listings don't track a specific frag
  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    // restart from slot 2 (past "." and "..") of the remapped frag
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7682
7683 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7684 {
7685 ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
7686 dirp->buffer.clear();
7687 }
7688
7689 int Client::_readdir_get_frag(dir_result_t *dirp)
7690 {
7691 assert(dirp);
7692 assert(dirp->inode);
7693
7694 // get the current frag.
7695 frag_t fg;
7696 if (dirp->hash_order())
7697 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7698 else
7699 fg = frag_t(dirp->offset_high());
7700
7701 ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
7702 << " offset " << hex << dirp->offset << dec << dendl;
7703
7704 int op = CEPH_MDS_OP_READDIR;
7705 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7706 op = CEPH_MDS_OP_LSSNAP;
7707
7708 InodeRef& diri = dirp->inode;
7709
7710 MetaRequest *req = new MetaRequest(op);
7711 filepath path;
7712 diri->make_nosnap_relative_path(path);
7713 req->set_filepath(path);
7714 req->set_inode(diri.get());
7715 req->head.args.readdir.frag = fg;
7716 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7717 if (dirp->last_name.length()) {
7718 req->path2.set_path(dirp->last_name);
7719 } else if (dirp->hash_order()) {
7720 req->head.args.readdir.offset_hash = dirp->offset_high();
7721 }
7722 req->dirp = dirp;
7723
7724 bufferlist dirbl;
7725 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7726
7727 if (res == -EAGAIN) {
7728 ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
7729 _readdir_rechoose_frag(dirp);
7730 return _readdir_get_frag(dirp);
7731 }
7732
7733 if (res == 0) {
7734 ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
7735 << " size " << dirp->buffer.size() << dendl;
7736 } else {
7737 ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
7738 dirp->set_end();
7739 }
7740
7741 return res;
7742 }
7743
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// dentries by their frag-aware readdir offset.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7749
7750 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
7751 int caps, bool getref)
7752 {
7753 assert(client_lock.is_locked());
7754 ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
7755 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
7756 << dendl;
7757 Dir *dir = dirp->inode->dir;
7758
7759 if (!dir) {
7760 ldout(cct, 10) << " dir is empty" << dendl;
7761 dirp->set_end();
7762 return 0;
7763 }
7764
7765 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
7766 dir->readdir_cache.end(),
7767 dirp->offset, dentry_off_lt());
7768
7769 string dn_name;
7770 while (true) {
7771 if (!dirp->inode->is_complete_and_ordered())
7772 return -EAGAIN;
7773 if (pd == dir->readdir_cache.end())
7774 break;
7775 Dentry *dn = *pd;
7776 if (dn->inode == NULL) {
7777 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
7778 ++pd;
7779 continue;
7780 }
7781 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
7782 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
7783 ++pd;
7784 continue;
7785 }
7786
7787 int r = _getattr(dn->inode, caps, dirp->perms);
7788 if (r < 0)
7789 return r;
7790
7791 struct ceph_statx stx;
7792 struct dirent de;
7793 fill_statx(dn->inode, caps, &stx);
7794
7795 uint64_t next_off = dn->offset + 1;
7796 ++pd;
7797 if (pd == dir->readdir_cache.end())
7798 next_off = dir_result_t::END;
7799
7800 Inode *in = NULL;
7801 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
7802 if (getref) {
7803 in = dn->inode.get();
7804 _ll_get(in);
7805 }
7806
7807 dn_name = dn->name; // fill in name while we have lock
7808
7809 client_lock.Unlock();
7810 r = cb(p, &de, &stx, next_off, in); // _next_ offset
7811 client_lock.Lock();
7812 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
7813 << " = " << r << dendl;
7814 if (r < 0) {
7815 return r;
7816 }
7817
7818 dirp->offset = next_off;
7819 if (dirp->at_end())
7820 dirp->next_offset = 2;
7821 else
7822 dirp->next_offset = dirp->offset_low();
7823 dirp->last_name = dn_name; // we successfully returned this one; update!
7824 dirp->release_count = 0; // last_name no longer match cache index
7825 if (r > 0)
7826 return r;
7827 }
7828
7829 ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
7830 dirp->set_end();
7831 return 0;
7832 }
7833
// Core readdir driver: feed directory entries to callback `cb` (with
// opaque arg `p`) until the callback returns nonzero or the directory is
// exhausted.  Synthesizes "." and ".." at offsets 0 and 1, then serves
// entries either from the local dentry cache (when the dir is complete
// and ordered and we hold FILE_SHARED caps) or frag-by-frag from the
// MDS.  `want`/`flags` select which statx fields the callback needs;
// `getref` makes each delivered inode carry an extra ll reference.
// Returns 0 at end of directory, the callback's positive value if it
// stopped the walk, or a negative error.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "."
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the lock around the callback; it may re-enter the client
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".."
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;  // no parent dentry (e.g. root): ".." is the dir itself
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // -EAGAIN: cache was invalidated under us; fall through to MDS path
    if (err != -EAGAIN)
      return err;
  }

  // MDS path: iterate frag by frag, buffering one chunk at a time
  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // current frag has more entries than fit in one reply
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // walked every frag; if nothing changed underneath us, mark the
    // directory's dentry cache complete (and ordered if no reordering
    // happened) so future listings can be served locally
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // not reached: the loop always returns
  return 0;
}
8026
8027
// POSIX-style readdir_r: fill *de with the next entry of d.
// Returns 1 when an entry was produced, 0 at end of directory, <0 on error.
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
8032
8033 /*
8034 * readdirplus_r
8035 *
8036 * returns
8037 * 1 if we got a dirent
8038 * 0 for end of directory
8039 * <0 on error
8040 */
8041
// Context for _readdir_single_dirent_cb: receives exactly one directory
// entry from readdir_r_cb().
struct single_readdir {
  struct dirent *de;       // destination dirent (never NULL)
  struct ceph_statx *stx;  // optional statx destination (may be NULL)
  Inode *inode;            // inode delivered with the entry
  bool full;               // set once an entry has been taken
};
8048
8049 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8050 struct ceph_statx *stx, off_t off,
8051 Inode *in)
8052 {
8053 single_readdir *c = static_cast<single_readdir *>(p);
8054
8055 if (c->full)
8056 return -1; // already filled this dirent
8057
8058 *c->de = *de;
8059 if (c->stx)
8060 *c->stx = *stx;
8061 c->inode = in;
8062 c->full = true;
8063 return 1;
8064 }
8065
8066 struct dirent *Client::readdir(dir_result_t *d)
8067 {
8068 int ret;
8069 static struct dirent de;
8070 single_readdir sr;
8071 sr.de = &de;
8072 sr.stx = NULL;
8073 sr.inode = NULL;
8074 sr.full = false;
8075
8076 // our callback fills the dirent and sets sr.full=true on first
8077 // call, and returns -1 the second time around.
8078 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8079 if (ret < -1) {
8080 errno = -ret; // this sucks.
8081 return (dirent *) NULL;
8082 }
8083 if (sr.full) {
8084 return &de;
8085 }
8086 return (dirent *) NULL;
8087 }
8088
8089 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8090 struct ceph_statx *stx, unsigned want,
8091 unsigned flags, Inode **out)
8092 {
8093 single_readdir sr;
8094 sr.de = de;
8095 sr.stx = stx;
8096 sr.inode = NULL;
8097 sr.full = false;
8098
8099 // our callback fills the dirent and sets sr.full=true on first
8100 // call, and returns -1 the second time around.
8101 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8102 if (r < -1)
8103 return r;
8104 if (out)
8105 *out = sr.inode;
8106 if (sr.full)
8107 return 1;
8108 return 0;
8109 }
8110
8111
8112 /* getdents */
/* getdents */
// Accumulator for _readdir_getdent_cb: packs entries into a flat buffer.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: pack whole dirents; false: pack names only
};
8119
8120 static int _readdir_getdent_cb(void *p, struct dirent *de,
8121 struct ceph_statx *stx, off_t off, Inode *in)
8122 {
8123 struct getdents_result *c = static_cast<getdents_result *>(p);
8124
8125 int dlen;
8126 if (c->fullent)
8127 dlen = sizeof(*de);
8128 else
8129 dlen = strlen(de->d_name) + 1;
8130
8131 if (c->pos + dlen > c->buflen)
8132 return -1; // doesn't fit
8133
8134 if (c->fullent) {
8135 memcpy(c->buf + c->pos, de, sizeof(*de));
8136 } else {
8137 memcpy(c->buf + c->pos, de->d_name, dlen);
8138 }
8139 c->pos += dlen;
8140 return 0;
8141 }
8142
8143 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8144 {
8145 getdents_result gr;
8146 gr.buf = buf;
8147 gr.buflen = buflen;
8148 gr.fullent = fullent;
8149 gr.pos = 0;
8150
8151 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8152
8153 if (r < 0) { // some error
8154 if (r == -1) { // buffer ran out of space
8155 if (gr.pos) { // but we got some entries already!
8156 return gr.pos;
8157 } // or we need a larger buffer
8158 return -ERANGE;
8159 } else { // actual error, return it
8160 return r;
8161 }
8162 }
8163 return gr.pos;
8164 }
8165
8166
8167 /* getdir */
/* getdir */
// Accumulator for _getdir_cb: collects entry names and counts them.
struct getdir_result {
  list<string> *contents;  // destination list of names
  int num;                 // number of entries appended
};
8172
8173 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8174 {
8175 getdir_result *r = static_cast<getdir_result *>(p);
8176
8177 r->contents->push_back(de->d_name);
8178 r->num++;
8179 return 0;
8180 }
8181
8182 int Client::getdir(const char *relpath, list<string>& contents,
8183 const UserPerm& perms)
8184 {
8185 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8186 {
8187 Mutex::Locker lock(client_lock);
8188 tout(cct) << "getdir" << std::endl;
8189 tout(cct) << relpath << std::endl;
8190 }
8191
8192 dir_result_t *d;
8193 int r = opendir(relpath, &d, perms);
8194 if (r < 0)
8195 return r;
8196
8197 getdir_result gr;
8198 gr.contents = &contents;
8199 gr.num = 0;
8200 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8201
8202 closedir(d);
8203
8204 if (r < 0)
8205 return r;
8206 return gr.num;
8207 }
8208
8209
8210 /****** file i/o **********/
/****** file i/o **********/
// Full-featured open: resolves relpath (honoring O_NOFOLLOW / O_PATH /
// O_CREAT|O_EXCL symlink rules), optionally creates the file with the
// given mode and striping parameters, checks permissions when
// client_permissions is enabled, and on success returns a new integer
// file descriptor.  Negative return is an errno-style error.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // exclusive create of an existing file fails outright
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  // O_NOFOLLOW on a symlink fails with ELOOP (unless O_PATH, where
  // opening the link itself is the point)
#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  // target missing and O_CREAT: walk to the parent and create there
  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may have already produced an Fh; otherwise open the inode
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8295
// Convenience overload: open with default file striping (stripe unit,
// stripe count and object size all 0 select the filesystem defaults).
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8301
8302 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8303 const UserPerm& perms)
8304 {
8305 Mutex::Locker lock(client_lock);
8306 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8307
8308 if (unmounting)
8309 return -ENOTCONN;
8310
8311 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8312 filepath path(ino);
8313 req->set_filepath(path);
8314
8315 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8316 char f[30];
8317 sprintf(f, "%u", h);
8318 filepath path2(dirino);
8319 path2.push_dentry(string(f));
8320 req->set_filepath2(path2);
8321
8322 int r = make_request(req, perms, NULL, NULL,
8323 rand() % mdsmap->get_num_in_mds());
8324 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8325 return r;
8326 }
8327
8328
8329 /**
8330 * Load inode into local cache.
8331 *
8332 * If inode pointer is non-NULL, and take a reference on
8333 * the resulting Inode object in one operation, so that caller
8334 * can safely assume inode will still be there after return.
8335 */
8336 int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8337 {
8338 ldout(cct, 8) << "lookup_ino enter(" << ino << ")" << dendl;
8339
8340 if (unmounting)
8341 return -ENOTCONN;
8342
8343 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8344 filepath path(ino);
8345 req->set_filepath(path);
8346
8347 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8348 if (r == 0 && inode != NULL) {
8349 vinodeno_t vino(ino, CEPH_NOSNAP);
8350 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8351 assert(p != inode_map.end());
8352 *inode = p->second;
8353 _ll_get(*inode);
8354 }
8355 ldout(cct, 8) << "lookup_ino exit(" << ino << ") = " << r << dendl;
8356 return r;
8357 }
8358
// Public, locking wrapper around _lookup_ino().
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  Mutex::Locker lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
8364
8365 /**
8366 * Find the parent inode of `ino` and insert it into
8367 * our cache. Conditionally also set `parent` to a referenced
8368 * Inode* if caller provides non-NULL value.
8369 */
8370 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8371 {
8372 ldout(cct, 8) << "lookup_parent enter(" << ino->ino << ")" << dendl;
8373
8374 if (unmounting)
8375 return -ENOTCONN;
8376
8377 if (!ino->dn_set.empty()) {
8378 // if we exposed the parent here, we'd need to check permissions,
8379 // but right now we just rely on the MDS doing so in make_request
8380 ldout(cct, 8) << "lookup_parent dentry already present" << dendl;
8381 return 0;
8382 }
8383
8384 if (ino->is_root()) {
8385 *parent = NULL;
8386 ldout(cct, 8) << "ino is root, no parent" << dendl;
8387 return -EINVAL;
8388 }
8389
8390 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8391 filepath path(ino->ino);
8392 req->set_filepath(path);
8393
8394 InodeRef target;
8395 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8396 // Give caller a reference to the parent ino if they provided a pointer.
8397 if (parent != NULL) {
8398 if (r == 0) {
8399 *parent = target.get();
8400 _ll_get(*parent);
8401 ldout(cct, 8) << "lookup_parent found parent " << (*parent)->ino << dendl;
8402 } else {
8403 *parent = NULL;
8404 }
8405 }
8406 ldout(cct, 8) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
8407 return r;
8408 }
8409
// Public, locking wrapper around _lookup_parent().
int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  Mutex::Locker lock(client_lock);
  return _lookup_parent(ino, perms, parent);
}
8415
8416 /**
8417 * Populate the parent dentry for `ino`, provided it is
8418 * a child of `parent`.
8419 */
// Populate the parent dentry for `ino`, provided it is a child of
// `parent` (LOOKUPNAME asks the MDS for the dentry linking the two).
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  assert(parent->is_dir());
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // any active MDS can service the lookup
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8437
// Public, locking wrapper around _lookup_name().
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
8443
// Allocate and initialize a file handle for `in` opened with the given
// flags and cap-mode, including per-handle readahead configuration.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  assert(in);
  Fh *f = new Fh(in);
  f->mode = cmode;
  f->flags = flags;

  // inode
  f->actor_perms = perms;

  ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // snapshot inodes track open handles via snap_cap_refs instead of
    // normal cap open refs
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
		  << ccap_string(in->caps_issued()) << dendl;
  }

  // configure readahead from client config, capped by both an absolute
  // byte limit and a multiple of the file's layout period
  const md_config_t *conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead windows to the layout period and stripe unit
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
8480
// Tear down a file handle: drop delegations and open refs (flushing and
// re-checking caps if this was the last opener), release file locks,
// and surface any asynchronous write-back error to the caller.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    // last open ref in this mode: flush dirty data and let the MDS know
    // our wanted caps may have shrunk
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes track opens via snap_cap_refs (see _create_fh)
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8515
8516 void Client::_put_fh(Fh *f)
8517 {
8518 int left = f->put();
8519 if (!left) {
8520 delete f;
8521 }
8522 }
8523
/*
 * Open an already-resolved inode.  Takes an open ref on the inode (which
 * affects the caps we report as wanted) and, unless the required caps are
 * already issued, sends CEPH_MDS_OP_OPEN to the MDS.  On success, *fhp (if
 * non-NULL) receives a freshly allocated Fh.  Returns 0 or negative errno.
 */
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are immutable: refuse any write-ish open flags
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // already have what we need; just let the MDS know our wanted set changed
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    // need to ask the MDS; O_CREAT is stripped because the inode already exists
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode);
      } else {
	// only needed the caps momentarily to serialize with delegations
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the pending-open note taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8602
/*
 * Re-acquire caps for an inode with open handles (e.g. after an MDS session
 * event).  If usable caps are already present, just nudge check_caps();
 * otherwise replay an OPEN request with flags derived from the wanted caps.
 * Returns 0 or the result of the MDS request.
 */
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    // nothing to re-open; just refresh our wanted set with the MDS
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // map the wanted caps back onto open flags for the replayed OPEN
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8640
8641 int Client::close(int fd)
8642 {
8643 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8644 Mutex::Locker lock(client_lock);
8645 tout(cct) << "close" << std::endl;
8646 tout(cct) << fd << std::endl;
8647
8648 if (unmounting)
8649 return -ENOTCONN;
8650
8651 Fh *fh = get_filehandle(fd);
8652 if (!fh)
8653 return -EBADF;
8654 int err = _release_fh(fh);
8655 fd_map.erase(fd);
8656 put_fd(fd);
8657 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8658 return err;
8659 }
8660
8661
8662 // ------------
8663 // read, write
8664
8665 loff_t Client::lseek(int fd, loff_t offset, int whence)
8666 {
8667 Mutex::Locker lock(client_lock);
8668 tout(cct) << "lseek" << std::endl;
8669 tout(cct) << fd << std::endl;
8670 tout(cct) << offset << std::endl;
8671 tout(cct) << whence << std::endl;
8672
8673 if (unmounting)
8674 return -ENOTCONN;
8675
8676 Fh *f = get_filehandle(fd);
8677 if (!f)
8678 return -EBADF;
8679 #if defined(__linux__) && defined(O_PATH)
8680 if (f->flags & O_PATH)
8681 return -EBADF;
8682 #endif
8683 return _lseek(f, offset, whence);
8684 }
8685
8686 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8687 {
8688 Inode *in = f->inode.get();
8689 int r;
8690
8691 switch (whence) {
8692 case SEEK_SET:
8693 f->pos = offset;
8694 break;
8695
8696 case SEEK_CUR:
8697 f->pos += offset;
8698 break;
8699
8700 case SEEK_END:
8701 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8702 if (r < 0)
8703 return r;
8704 f->pos = in->size + offset;
8705 break;
8706
8707 default:
8708 ceph_abort();
8709 }
8710
8711 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8712 return f->pos;
8713 }
8714
8715
/*
 * Acquire the logical file-position lock on an Fh.  Each waiter queues its
 * own Cond on f->pos_waiters and sleeps (on client_lock) until the position
 * is unlocked AND its Cond has reached the front of the queue, which keeps
 * acquisition FIFO-ordered.  Must be called with client_lock held; pairs
 * with unlock_fh_pos().
 */
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until unlocked and we are the oldest waiter
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8733
8734 void Client::unlock_fh_pos(Fh *f)
8735 {
8736 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8737 f->pos_locked = false;
8738 }
8739
/*
 * Migrate inline file data out to the file's first RADOS object
 * ("uninline").  Queues two ops on the objecter: one creating the object,
 * then a guarded write (cmpxattr on "inline_version") that copies the
 * inline data into it.  onfinish is completed when the second op finishes,
 * or immediately if there is no inline data.  Always returns 0.
 */
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // object id of the first stripe object: <ino>.00000000
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // first op: make sure the object exists (create(false) == non-exclusive)
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  // second op: write only if our inline_version is newer than the one
  // recorded on the object (guards against racing uninline attempts)
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8784
8785 //
8786
8787 // blocking osd interface
8788
8789 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8790 {
8791 Mutex::Locker lock(client_lock);
8792 tout(cct) << "read" << std::endl;
8793 tout(cct) << fd << std::endl;
8794 tout(cct) << size << std::endl;
8795 tout(cct) << offset << std::endl;
8796
8797 if (unmounting)
8798 return -ENOTCONN;
8799
8800 Fh *f = get_filehandle(fd);
8801 if (!f)
8802 return -EBADF;
8803 #if defined(__linux__) && defined(O_PATH)
8804 if (f->flags & O_PATH)
8805 return -EBADF;
8806 #endif
8807 bufferlist bl;
8808 int r = _read(f, offset, size, &bl);
8809 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8810 if (r >= 0) {
8811 bl.copy(0, bl.length(), buf);
8812 r = bl.length();
8813 }
8814 return r;
8815 }
8816
8817 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8818 {
8819 if (iovcnt < 0)
8820 return -EINVAL;
8821 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8822 }
8823
/*
 * Core read path.  A negative offset means "use and advance the handle's
 * file position" (guarded by lock_fh_pos/unlock_fh_pos).  Serves inline
 * data directly when possible, otherwise goes through the object cache
 * (_read_async) or synchronous OSD reads (_read_sync), retrying once after
 * a size re-check on a short read.  Returns bytes read or negative errno.
 */
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // offset < 0: read at (and advance) the handle's current position
  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we have not yet fetched the inline data state
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
	unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  // O_DIRECT bypasses the cache even if we hold the CACHE cap
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // cannot serve inline data without the CACHE cap; push it to RADOS
      // first and wait for completion below (after the done: label)
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve the read straight from the inline data buffer
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
      } else if ((uint64_t)offset < endoff) {
	// hole past the inline data but before EOF: zeros
	bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // cached read path
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    // sync read path (no cache cap, O_DIRECT, or forced by conf)
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read: drop caps, re-verify size with the MDS, and retry
      // if the file turns out to extend past what we read
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof? short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait for the uninline op issued above to complete (drop client_lock
    // while blocking on its private flock/cond)
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
8967
// Completion for a speculative readahead: pins the Fh and records the
// in-flight readahead so the Readahead bookkeeping stays balanced
// (both are undone in the destructor).
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
8973
Client::C_Readahead::~C_Readahead() {
  // mirror the constructor: drop the pending-readahead count and the Fh ref
  f->readahead.dec_pending();
  client->_put_fh(f);
}
8978
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // release the cap refs taken when the readahead was issued in _read_async()
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8983
/*
 * Read through the object cache.  Trims the request to the known file
 * size, performs the (possibly blocking) cached read, then kicks off a
 * speculative readahead whose completion (C_Readahead) releases the extra
 * cap refs taken here.  Returns bytes read or negative errno.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
		 << " max_bytes=" << f->readahead.get_max_readahead_size()
		 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    // cache miss: wait for the OSD read, dropping client_lock meanwhile
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	// readahead in flight; caps are released by C_Readahead::finish
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9048
/*
 * Synchronous (uncached) read: issue Filer reads directly to the OSDs,
 * looping until the request is satisfied.  Holes below the known EOF are
 * zero-filled; a short read that may indicate EOF sets *checkeof so the
 * caller can re-verify the size and retry.  Returns bytes read or errno.
 */
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    // wait for the OSD read, dropping client_lock meanwhile
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // may be at EOF; let the caller re-verify the size and retry
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9115
9116
9117 /*
9118 * we keep count of uncommitted sync writes on the inode, so that
9119 * fsync can DDRT.
9120 */
void Client::_sync_write_commit(Inode *in)
{
  // one fewer uncommitted sync write in flight
  assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  // matches the get_cap_ref taken when the sync write was submitted
  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    // unmount may be blocked waiting for the last unsafe write to commit
    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9134
9135 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9136 {
9137 Mutex::Locker lock(client_lock);
9138 tout(cct) << "write" << std::endl;
9139 tout(cct) << fd << std::endl;
9140 tout(cct) << size << std::endl;
9141 tout(cct) << offset << std::endl;
9142
9143 if (unmounting)
9144 return -ENOTCONN;
9145
9146 Fh *fh = get_filehandle(fd);
9147 if (!fh)
9148 return -EBADF;
9149 #if defined(__linux__) && defined(O_PATH)
9150 if (fh->flags & O_PATH)
9151 return -EBADF;
9152 #endif
9153 int r = _write(fh, offset, size, buf, NULL, 0);
9154 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9155 return r;
9156 }
9157
9158 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9159 {
9160 if (iovcnt < 0)
9161 return -EINVAL;
9162 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9163 }
9164
/*
 * Common implementation for preadv/pwritev.  Sums the iovec lengths and
 * delegates to _write (write=true) or _read (write=false); on the read
 * side the returned bufferlist is scattered back into the caller's iovecs,
 * handling the case where fewer bytes were read than requested.
 * Returns bytes transferred or negative errno.
 */
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    // _write gathers directly from the iovecs
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // scatter the read data back into the caller's iovecs
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9214
/*
 * Core write path.  Data comes either from a flat buffer (buf) or from an
 * iovec array.  A negative offset means "use and advance the handle's file
 * position" (honoring O_APPEND).  Handles quota and pool-full checks,
 * setuid/setgid clearing, inline data, and both the buffered (object
 * cache) and synchronous write paths.  Returns bytes written or errno.
 */
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	           const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  uint64_t endoff = offset + size;
  std::list<InodeRef> quota_roots;
  if (endoff > in->size &&
      is_quota_bytes_exceeded(in, endoff - in->size, f->actor_perms, &quota_roots)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    // remember the position to install after a successful write
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // inline_version == 0 means inline state has not been fetched yet
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    // AUTH_SHARED was only needed to inspect in->mode above
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
	endoff > CEPH_INLINE_MAX_SIZE ||
	!(have & CEPH_CAP_FILE_BUFFER)) {
      // write would outgrow the inline limits (or we lack BUFFER caps):
      // push the inline data out to RADOS; completion is awaited at done:
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // apply the write directly to the inline data buffer
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
	in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
	in->inline_data.splice(offset, len - offset);
      else if (offset > len)
	in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    // wait for the OSD write, dropping client_lock meanwhile
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, quota_roots)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (onuninline) {
    // wait for the uninline op issued above to complete
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9460
9461 int Client::_flush(Fh *f)
9462 {
9463 Inode *in = f->inode.get();
9464 int err = f->take_async_err();
9465 if (err != 0) {
9466 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9467 << cpp_strerror(err) << dendl;
9468 } else {
9469 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9470 }
9471
9472 return err;
9473 }
9474
9475 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9476 {
9477 struct ceph_statx stx;
9478 stx.stx_size = length;
9479 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9480 }
9481
9482 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9483 {
9484 Mutex::Locker lock(client_lock);
9485 tout(cct) << "ftruncate" << std::endl;
9486 tout(cct) << fd << std::endl;
9487 tout(cct) << length << std::endl;
9488
9489 if (unmounting)
9490 return -ENOTCONN;
9491
9492 Fh *f = get_filehandle(fd);
9493 if (!f)
9494 return -EBADF;
9495 #if defined(__linux__) && defined(O_PATH)
9496 if (f->flags & O_PATH)
9497 return -EBADF;
9498 #endif
9499 struct stat attr;
9500 attr.st_size = length;
9501 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9502 }
9503
9504 int Client::fsync(int fd, bool syncdataonly)
9505 {
9506 Mutex::Locker lock(client_lock);
9507 tout(cct) << "fsync" << std::endl;
9508 tout(cct) << fd << std::endl;
9509 tout(cct) << syncdataonly << std::endl;
9510
9511 if (unmounting)
9512 return -ENOTCONN;
9513
9514 Fh *f = get_filehandle(fd);
9515 if (!f)
9516 return -EBADF;
9517 #if defined(__linux__) && defined(O_PATH)
9518 if (f->flags & O_PATH)
9519 return -EBADF;
9520 #endif
9521 int r = _fsync(f, syncdataonly);
9522 if (r == 0) {
9523 // The IOs in this fsync were okay, but maybe something happened
9524 // in the background that we shoudl be reporting?
9525 r = f->take_async_err();
9526 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9527 << ") = 0, async_err = " << r << dendl;
9528 } else {
9529 // Assume that an error we encountered during fsync, even reported
9530 // synchronously, would also have applied the error to the Fh, and we
9531 // should clear it here to avoid returning the same error again on next
9532 // call.
9533 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9534 << r << dendl;
9535 f->take_async_err();
9536 }
9537 return r;
9538 }
9539
/*
 * Flush an inode's dirty state to stable storage: dirty file data (via the
 * object cacher when enabled), dirty caps/metadata (unless syncdataonly),
 * and any still-unsafe MDS requests.  Blocks until everything relevant is
 * committed.  Returns 0 or negative errno from the data flush.
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty metadata caps to the MDS; remember the flush tid so we can
    // wait for its commit below
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // ask the MDS to flush its log, then wait for our last unsafe request
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9606
9607 int Client::_fsync(Fh *f, bool syncdataonly)
9608 {
9609 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9610 return _fsync(f->inode.get(), syncdataonly);
9611 }
9612
9613 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9614 {
9615 Mutex::Locker lock(client_lock);
9616 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9617 tout(cct) << fd << std::endl;
9618
9619 if (unmounting)
9620 return -ENOTCONN;
9621
9622 Fh *f = get_filehandle(fd);
9623 if (!f)
9624 return -EBADF;
9625 int r = _getattr(f->inode, mask, perms);
9626 if (r < 0)
9627 return r;
9628 fill_stat(f->inode, stbuf, NULL);
9629 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9630 return r;
9631 }
9632
9633 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9634 unsigned int want, unsigned int flags)
9635 {
9636 Mutex::Locker lock(client_lock);
9637 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9638 tout(cct) << fd << std::endl;
9639
9640 if (unmounting)
9641 return -ENOTCONN;
9642
9643 Fh *f = get_filehandle(fd);
9644 if (!f)
9645 return -EBADF;
9646
9647 unsigned mask = statx_to_mask(flags, want);
9648
9649 int r = 0;
9650 if (mask && !f->inode->caps_issued_mask(mask, true)) {
9651 r = _getattr(f->inode, mask, perms);
9652 if (r < 0) {
9653 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9654 return r;
9655 }
9656 }
9657
9658 fill_statx(f->inode, mask, stx);
9659 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9660 return r;
9661 }
9662
9663 // not written yet, but i want to link!
9664
9665 int Client::chdir(const char *relpath, std::string &new_cwd,
9666 const UserPerm& perms)
9667 {
9668 Mutex::Locker lock(client_lock);
9669 tout(cct) << "chdir" << std::endl;
9670 tout(cct) << relpath << std::endl;
9671
9672 if (unmounting)
9673 return -ENOTCONN;
9674
9675 filepath path(relpath);
9676 InodeRef in;
9677 int r = path_walk(path, &in, perms);
9678 if (r < 0)
9679 return r;
9680 if (cwd != in)
9681 cwd.swap(in);
9682 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9683
9684 _getcwd(new_cwd, perms);
9685 return 0;
9686 }
9687
// Build the absolute path of the current working directory by walking
// parent dentries from cwd up to the mount root.  If a link in the chain
// is missing from our cache, ask the MDS for it (LOOKUPNAME) and restart
// the walk from scratch.  On an unlinked cwd/ancestor, `dir` is left
// untouched.  Caller must hold client_lock.
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << "getcwd " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    assert(in->dn_set.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dn_set.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      // (make_request may have dropped/retaken client_lock, so the
      // dentry chain we walked so far can no longer be trusted)
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9727
9728 void Client::getcwd(string& dir, const UserPerm& perms)
9729 {
9730 Mutex::Locker l(client_lock);
9731 if (!unmounting)
9732 _getcwd(dir, perms);
9733 }
9734
// statvfs(3)-style filesystem statistics.  Queries RADOS for pool usage
// (dropping client_lock across the objecter wait), then either reports
// quota-derived numbers (when a size quota covers the mount root) or the
// raw cluster totals.  `path` is unused; stats are global to the mount.
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;
  unsigned long int total_files_on_fs;

  if (unmounting)
    return -ENOTCONN;

  ceph_statfs stats;
  C_SaferCond cond;

  // Single data pool: ask for that pool's stats so quota-free df is
  // accurate; otherwise fall back to whole-cluster stats.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  // Drop the client lock while waiting on the OSD round trip.
  client_lock.Unlock();
  int rval = cond.wait();
  assert(root);
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_ffree = 0;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1; // ??
  stbuf->f_flag = 0; // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    // (stats.kb is in KB; shift by BLOCK_SHIFT-10 converts KB -> 4MB blocks)
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9836
// Send a file-lock operation (fcntl- or flock-style) to the MDS and, on
// success, mirror the result into the local lock-state bookkeeping.
//   lock_type: CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
//   op:        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
//   sleep:     non-zero to block waiting for a conflicting lock
//   removing:  true when called from _release_filelocks(); skips updating
//              the per-Fh state (it is being torn down)
// Returns 0 or a negative errno (-EIO on an unrecognized l_type).
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Translate POSIX lock type into the ceph wire value.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // Only a blocking SETFILELOCK that actually acquires may sleep.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    // (req->get() takes an extra ref that the interrupt machinery holds;
    // balanced by put_request() below)
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the conflicting (or absent) lock reported by the MDS back
      // into the caller's struct flock.
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the successful change into the inode-wide state, lazily
      // allocating the tracking structure on first use.
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // Also track per file handle, so close/release can drop the locks —
      // unless we are the ones releasing them right now.
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9947
// Interrupt a blocked file-lock request (`req`).  Marks the request
// aborted with -EINTR and, if it already reached an MDS, sends a matching
// *_INTR unlock so the MDS stops waiting on our behalf.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Choose the interrupt rule matching the original lock flavor.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // Clone the original lock arguments, then rewrite rule/type so the MDS
  // treats it as "cancel that pending lock".
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // Reuse the credentials of the request being interrupted.
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
9980
9981 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9982 {
9983 if (!in->fcntl_locks && !in->flock_locks)
9984 return;
9985
9986 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9987 ::encode(nr_fcntl_locks, bl);
9988 if (nr_fcntl_locks) {
9989 ceph_lock_state_t* lock_state = in->fcntl_locks;
9990 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9991 p != lock_state->held_locks.end();
9992 ++p)
9993 ::encode(p->second, bl);
9994 }
9995
9996 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9997 ::encode(nr_flock_locks, bl);
9998 if (nr_flock_locks) {
9999 ceph_lock_state_t* lock_state = in->flock_locks;
10000 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10001 p != lock_state->held_locks.end();
10002 ++p)
10003 ::encode(p->second, bl);
10004 }
10005
10006 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
10007 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10008 }
10009
10010 void Client::_release_filelocks(Fh *fh)
10011 {
10012 if (!fh->fcntl_locks && !fh->flock_locks)
10013 return;
10014
10015 Inode *in = fh->inode.get();
10016 ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
10017
10018 list<pair<int, ceph_filelock> > to_release;
10019
10020 if (fh->fcntl_locks) {
10021 ceph_lock_state_t* lock_state = fh->fcntl_locks;
10022 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10023 p != lock_state->held_locks.end();
10024 ++p)
10025 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
10026 delete fh->fcntl_locks;
10027 }
10028 if (fh->flock_locks) {
10029 ceph_lock_state_t* lock_state = fh->flock_locks;
10030 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10031 p != lock_state->held_locks.end();
10032 ++p)
10033 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
10034 delete fh->flock_locks;
10035 }
10036
10037 if (to_release.empty())
10038 return;
10039
10040 struct flock fl;
10041 memset(&fl, 0, sizeof(fl));
10042 fl.l_whence = SEEK_SET;
10043 fl.l_type = F_UNLCK;
10044
10045 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10046 p != to_release.end();
10047 ++p) {
10048 fl.l_start = p->second.start;
10049 fl.l_len = p->second.length;
10050 fl.l_pid = p->second.pid;
10051 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10052 p->second.owner, true);
10053 }
10054 }
10055
10056 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10057 ceph_lock_state_t *lock_state)
10058 {
10059 int lock_cmd;
10060 if (F_RDLCK == fl->l_type)
10061 lock_cmd = CEPH_LOCK_SHARED;
10062 else if (F_WRLCK == fl->l_type)
10063 lock_cmd = CEPH_LOCK_EXCL;
10064 else
10065 lock_cmd = CEPH_LOCK_UNLOCK;;
10066
10067 ceph_filelock filelock;
10068 filelock.start = fl->l_start;
10069 filelock.length = fl->l_len;
10070 filelock.client = 0;
10071 // see comment in _do_filelock()
10072 filelock.owner = owner | (1ULL << 63);
10073 filelock.pid = fl->l_pid;
10074 filelock.type = lock_cmd;
10075
10076 if (filelock.type == CEPH_LOCK_UNLOCK) {
10077 list<ceph_filelock> activated_locks;
10078 lock_state->remove_lock(filelock, activated_locks);
10079 } else {
10080 bool r = lock_state->add_lock(filelock, false, false, NULL);
10081 assert(r);
10082 }
10083 }
10084
10085 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10086 {
10087 Inode *in = fh->inode.get();
10088 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10089 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10090 return ret;
10091 }
10092
10093 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10094 {
10095 Inode *in = fh->inode.get();
10096 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10097 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10098 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10099 return ret;
10100 }
10101
10102 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10103 {
10104 Inode *in = fh->inode.get();
10105 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10106
10107 int sleep = !(cmd & LOCK_NB);
10108 cmd &= ~LOCK_NB;
10109
10110 int type;
10111 switch (cmd) {
10112 case LOCK_SH:
10113 type = F_RDLCK;
10114 break;
10115 case LOCK_EX:
10116 type = F_WRLCK;
10117 break;
10118 case LOCK_UN:
10119 type = F_UNLCK;
10120 break;
10121 default:
10122 return -EINVAL;
10123 }
10124
10125 struct flock fl;
10126 memset(&fl, 0, sizeof(fl));
10127 fl.l_type = type;
10128 fl.l_whence = SEEK_SET;
10129
10130 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10131 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10132 return ret;
10133 }
10134
10135 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10136 {
10137 /* Since the only thing this does is wrap a call to statfs, and
10138 statfs takes a lock, it doesn't seem we have a need to split it
10139 out. */
10140 return statfs(0, stbuf, perms);
10141 }
10142
// Register the callbacks supplied by the libcephfs/FUSE layer (cache
// invalidation, lock interruption, remount, umask).  For each callback that
// is present, the matching finisher thread is started so the callback can
// be invoked asynchronously.  Safe to call with a null args (no-op).
void Client::ll_register_callbacks(struct client_callback_args *args)
{
  if (!args)
    return;
  Mutex::Locker l(client_lock);
  ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  // Opaque pointer handed back to every callback invocation.
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  // umask_cb needs no finisher; it is stored unconditionally (may be null).
  umask_cb = args->umask_cb;
}
10173
// Verify that we have some way to invalidate the kernel's dentry cache:
// either a registered dentry-invalidate callback, or a remount callback
// (which is exercised once via _do_remount).  If neither works and the
// config says so, abort; otherwise just log loudly.  Returns 0 on success
// or the remount test's error.
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount(false);
  }
  // NOTE: if neither branch above applies, r stays 0 and the failure is
  // silently tolerated; the abort below only triggers on a remount error.
  if (r) {
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort) {
      lderr(cct) << "no method to invalidate kernel dentry cache; quitting!" << dendl;
      ceph_abort();
    } else {
      lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    }
  }
  return r;
}
10199
// Flush everything to stable storage: dirty file data via the object
// cacher, dirty caps to the MDS, and all unsafe (unacked) MDS requests.
// Caller must hold client_lock; it is dropped while waiting for the data
// flush to complete.  Always returns 0.
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;  // no object cacher => nothing buffered to flush

  // flush caps
  flush_caps_sync();
  // snapshot the flush tid now so we wait only for caps dirtied up to
  // this point, not for later activity.
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    // Drop client_lock while blocking on the flush completion; C_SafeCond
    // signals `cond` under the local `lock`.
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
10234
10235 int Client::sync_fs()
10236 {
10237 Mutex::Locker l(client_lock);
10238
10239 if (unmounting)
10240 return -ENOTCONN;
10241
10242 return _sync_fs();
10243 }
10244
// Drop all releasable data from the object cacher; the return value comes
// straight from ObjectCacher::release_all() (presumably the amount that
// could NOT be released or was released — confirm against osdc/ObjectCacher).
// NOTE(review): unlike the other public entry points here, this does not
// check `unmounting` — verify that is intentional.
int64_t Client::drop_caches()
{
  Mutex::Locker l(client_lock);
  return objectcacher->release_all();
}
10250
10251
10252 int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10253 {
10254 Mutex::Locker l(client_lock);
10255 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10256 << ", " << offset << ", " << count << ")" << dendl;
10257
10258 Fh *f = get_filehandle(fd);
10259 if (!f)
10260 return -EBADF;
10261
10262 // for now
10263 _fsync(f, true);
10264
10265 return 0;
10266 }
10267
10268 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10269 {
10270 Mutex::Locker l(client_lock);
10271 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10272 << ", " << offset << ", " << count << ")" << dendl;
10273
10274 Fh *f = get_filehandle(fd);
10275 if (!f)
10276 return -EBADF;
10277 Inode *in = f->inode.get();
10278
10279 _fsync(f, true);
10280 if (_release(in))
10281 check_caps(in, 0);
10282 return 0;
10283 }
10284
10285
10286 // =============================
10287 // snaps
10288
10289 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10290 {
10291 Mutex::Locker l(client_lock);
10292
10293 if (unmounting)
10294 return -ENOTCONN;
10295
10296 filepath path(relpath);
10297 InodeRef in;
10298 int r = path_walk(path, &in, perm);
10299 if (r < 0)
10300 return r;
10301 if (cct->_conf->client_permissions) {
10302 r = may_create(in.get(), perm);
10303 if (r < 0)
10304 return r;
10305 }
10306 Inode *snapdir = open_snapdir(in.get());
10307 return _mkdir(snapdir, name, 0, perm);
10308 }
10309
10310 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10311 {
10312 Mutex::Locker l(client_lock);
10313
10314 if (unmounting)
10315 return -ENOTCONN;
10316
10317 filepath path(relpath);
10318 InodeRef in;
10319 int r = path_walk(path, &in, perms);
10320 if (r < 0)
10321 return r;
10322 if (cct->_conf->client_permissions) {
10323 r = may_delete(in.get(), NULL, perms);
10324 if (r < 0)
10325 return r;
10326 }
10327 Inode *snapdir = open_snapdir(in.get());
10328 return _rmdir(snapdir, name, perms);
10329 }
10330
10331 // =============================
10332 // expose caps
10333
10334 int Client::get_caps_issued(int fd) {
10335
10336 Mutex::Locker lock(client_lock);
10337
10338 if (unmounting)
10339 return -ENOTCONN;
10340
10341 Fh *f = get_filehandle(fd);
10342 if (!f)
10343 return -EBADF;
10344
10345 return f->inode->caps_issued();
10346 }
10347
10348 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10349 {
10350 Mutex::Locker lock(client_lock);
10351
10352 if (unmounting)
10353 return -ENOTCONN;
10354
10355 filepath p(path);
10356 InodeRef in;
10357 int r = path_walk(p, &in, perms, true);
10358 if (r < 0)
10359 return r;
10360 return in->caps_issued();
10361 }
10362
10363 // =========================================
10364 // low level
10365
// Return (creating on first use) the virtual ".snap" directory inode for
// `diri`.  The snapdir shares the directory's ino but uses snapid
// CEPH_SNAPDIR, and mirrors most of the directory's attributes.  The new
// inode is inserted into inode_map so later lookups find the same object.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // Clone the visible attributes of the real directory.
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    // Link back to the real directory and mark it so it knows a snapdir
    // alias exists.
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10397
// Low-level lookup of `name` under `parent`.  On success *out gets the
// child inode with an ll_ref taken (caller must balance with ll_forget/
// ll_put) and *attr is filled; on failure attr->st_ino is zeroed and *out
// is NULL (the InodeRef is empty).
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // FUSE can delegate permission checks to us; honor exec perm on parent.
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  // Pin the inode for the low-level API caller.
  _ll_get(in.get());

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10437
// Resolve an inode by number and make sure we also know its parent dentry
// (so the inode is properly linked into the cache).  On success *inode
// holds a reference the caller must eventually forget.
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r) {
    return r;
  }
  assert(inode != NULL);
  assert(*inode != NULL);

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r && r != -EINVAL) {
    // Unexpected error
    // (drop the reference taken in Num1 before bailing out)
    _ll_forget(*inode, 1);
    return r;
  } else if (r == -EINVAL) {
    // EINVAL indicates node without parents (root), drop out now
    // and don't try to look up the non-existent dentry.
    return 0;
  }
  // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
  // is already in cache
  assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // The parent reference was only needed for the name lookup; release it.
  _ll_forget(parent, 1);
  return 0;
}
10482
// statx flavor of ll_lookup(): look up `name` under `parent`, fill *stx
// with the fields selected by want/flags, and on success hand back the
// child in *out with an ll_ref taken.  On failure stx ino/mask are zeroed
// and *out is NULL.
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // FUSE can delegate permission checks to us; honor exec perm on parent.
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    // Pin the inode for the low-level API caller.
    _ll_get(in.get());
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10523
// Walk a full path from the mount root and return the resulting inode
// (ll_ref taken) plus its statx.  AT_SYMLINK_NOFOLLOW in `flags` disables
// following a trailing symlink.  On error *out is NULL and stx is zeroed.
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << "ll_walk" << name << dendl;
  tout(cct) << "ll_walk" << std::endl;
  tout(cct) << name << std::endl;

  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    // Pin the inode for the low-level API caller.
    _ll_get(in.get());
    *out = in.get();
    return 0;
  }
}
10556
// Take one low-level (FUSE-visible) reference on `in`.  The first ll_ref
// also takes a regular inode ref and pins the directory's parent dentry so
// the path to the inode stays cached while the kernel holds it.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10569
// Drop `num` low-level references from `in`.  When the count reaches zero
// the dentry pin and the regular inode ref taken in _ll_get() are released.
// Returns the remaining ll_ref count (0 means fully released).
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10585
// Drop every outstanding low-level reference on every cached inode (used
// during teardown).  Inodes are also inserted into a local InodeRef set so
// none is destroyed while we are still iterating; the set's destruction at
// function exit performs the final puts.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << "_ll_drop_pins" << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  // _ll_put() can erase entries from inode_map, so capture the successor
  // iterator before touching the current element.
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
10603
// FUSE "forget": drop `count` low-level references from `in`.  Returns
// true when the inode's ll_ref reached zero (or the forget was ignored).
// Forgets on the root inode and forgets after unmount are ignored.
bool Client::_ll_forget(Inode *in, int count)
{
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 8) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // The kernel thinks it holds more refs than we do; clamp to what we
    // have rather than underflowing the counter.
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10632
10633 bool Client::ll_forget(Inode *in, int count)
10634 {
10635 Mutex::Locker lock(client_lock);
10636 return _ll_forget(in, count);
10637 }
10638
10639 bool Client::ll_put(Inode *in)
10640 {
10641 /* ll_forget already takes the lock */
10642 return ll_forget(in, 1);
10643 }
10644
10645 snapid_t Client::ll_get_snapid(Inode *in)
10646 {
10647 Mutex::Locker lock(client_lock);
10648 return in->snapid;
10649 }
10650
10651 Inode *Client::ll_get_inode(ino_t ino)
10652 {
10653 Mutex::Locker lock(client_lock);
10654
10655 if (unmounting)
10656 return NULL;
10657
10658 vinodeno_t vino = _map_faked_ino(ino);
10659 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10660 if (p == inode_map.end())
10661 return NULL;
10662 Inode *in = p->second;
10663 _ll_get(in);
10664 return in;
10665 }
10666
10667 Inode *Client::ll_get_inode(vinodeno_t vino)
10668 {
10669 Mutex::Locker lock(client_lock);
10670
10671 if (unmounting)
10672 return NULL;
10673
10674 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10675 if (p == inode_map.end())
10676 return NULL;
10677 Inode *in = p->second;
10678 _ll_get(in);
10679 return in;
10680 }
10681
10682 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10683 {
10684 vinodeno_t vino = _get_vino(in);
10685
10686 ldout(cct, 8) << "ll_getattr " << vino << dendl;
10687 tout(cct) << "ll_getattr" << std::endl;
10688 tout(cct) << vino.ino.val << std::endl;
10689
10690 if (vino.snapid < CEPH_NOSNAP)
10691 return 0;
10692 else
10693 return _getattr(in, caps, perms);
10694 }
10695
10696 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10697 {
10698 Mutex::Locker lock(client_lock);
10699
10700 if (unmounting)
10701 return -ENOTCONN;
10702
10703 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10704
10705 if (res == 0)
10706 fill_stat(in, attr);
10707 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10708 return res;
10709 }
10710
10711 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10712 unsigned int flags, const UserPerm& perms)
10713 {
10714 Mutex::Locker lock(client_lock);
10715
10716 if (unmounting)
10717 return -ENOTCONN;
10718
10719 int res = 0;
10720 unsigned mask = statx_to_mask(flags, want);
10721
10722 if (mask && !in->caps_issued_mask(mask, true))
10723 res = _ll_getattr(in, mask, perms);
10724
10725 if (res == 0)
10726 fill_statx(in, mask, stx);
10727 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10728 return res;
10729 }
10730
// Core of ll_setattr/ll_setattrx: log the request, optionally enforce
// permissions client-side, then apply the attribute change via __setattrx.
// *inp receives the (possibly updated) inode reference on success.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << "ll_setattrx " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << "ll_setattrx" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  // When FUSE's default_permissions is in effect the kernel has already
  // checked access; otherwise we must do it ourselves.
  if (!cct->_conf->fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // The *_NOW bits were only meaningful to may_setattr (utimes-style
  // "set to current time"); strip them before handing off.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
10759
10760 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10761 const UserPerm& perms)
10762 {
10763 Mutex::Locker lock(client_lock);
10764
10765 if (unmounting)
10766 return -ENOTCONN;
10767
10768 InodeRef target(in);
10769 int res = _ll_setattrx(in, stx, mask, perms, &target);
10770 if (res == 0) {
10771 assert(in == target.get());
10772 fill_statx(in, in->caps_issued(), stx);
10773 }
10774
10775 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10776 return res;
10777 }
10778
10779 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10780 const UserPerm& perms)
10781 {
10782 struct ceph_statx stx;
10783 stat_to_statx(attr, &stx);
10784
10785 Mutex::Locker lock(client_lock);
10786
10787 if (unmounting)
10788 return -ENOTCONN;
10789
10790 InodeRef target(in);
10791 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10792 if (res == 0) {
10793 assert(in == target.get());
10794 fill_stat(in, attr);
10795 }
10796
10797 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10798 return res;
10799 }
10800
10801
10802 // ----------
10803 // xattrs
10804
10805 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10806 const UserPerm& perms)
10807 {
10808 Mutex::Locker lock(client_lock);
10809
10810 if (unmounting)
10811 return -ENOTCONN;
10812
10813 InodeRef in;
10814 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10815 if (r < 0)
10816 return r;
10817 return _getxattr(in, name, value, size, perms);
10818 }
10819
10820 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10821 const UserPerm& perms)
10822 {
10823 Mutex::Locker lock(client_lock);
10824
10825 if (unmounting)
10826 return -ENOTCONN;
10827
10828 InodeRef in;
10829 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10830 if (r < 0)
10831 return r;
10832 return _getxattr(in, name, value, size, perms);
10833 }
10834
10835 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10836 const UserPerm& perms)
10837 {
10838 Mutex::Locker lock(client_lock);
10839
10840 if (unmounting)
10841 return -ENOTCONN;
10842
10843 Fh *f = get_filehandle(fd);
10844 if (!f)
10845 return -EBADF;
10846 return _getxattr(f->inode, name, value, size, perms);
10847 }
10848
10849 int Client::listxattr(const char *path, char *list, size_t size,
10850 const UserPerm& perms)
10851 {
10852 Mutex::Locker lock(client_lock);
10853
10854 if (unmounting)
10855 return -ENOTCONN;
10856
10857 InodeRef in;
10858 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10859 if (r < 0)
10860 return r;
10861 return Client::_listxattr(in.get(), list, size, perms);
10862 }
10863
10864 int Client::llistxattr(const char *path, char *list, size_t size,
10865 const UserPerm& perms)
10866 {
10867 Mutex::Locker lock(client_lock);
10868
10869 if (unmounting)
10870 return -ENOTCONN;
10871
10872 InodeRef in;
10873 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10874 if (r < 0)
10875 return r;
10876 return Client::_listxattr(in.get(), list, size, perms);
10877 }
10878
10879 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10880 {
10881 Mutex::Locker lock(client_lock);
10882
10883 if (unmounting)
10884 return -ENOTCONN;
10885
10886 Fh *f = get_filehandle(fd);
10887 if (!f)
10888 return -EBADF;
10889 return Client::_listxattr(f->inode.get(), list, size, perms);
10890 }
10891
10892 int Client::removexattr(const char *path, const char *name,
10893 const UserPerm& perms)
10894 {
10895 Mutex::Locker lock(client_lock);
10896
10897 if (unmounting)
10898 return -ENOTCONN;
10899
10900 InodeRef in;
10901 int r = Client::path_walk(path, &in, perms, true);
10902 if (r < 0)
10903 return r;
10904 return _removexattr(in, name, perms);
10905 }
10906
10907 int Client::lremovexattr(const char *path, const char *name,
10908 const UserPerm& perms)
10909 {
10910 Mutex::Locker lock(client_lock);
10911
10912 if (unmounting)
10913 return -ENOTCONN;
10914
10915 InodeRef in;
10916 int r = Client::path_walk(path, &in, perms, false);
10917 if (r < 0)
10918 return r;
10919 return _removexattr(in, name, perms);
10920 }
10921
10922 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10923 {
10924 Mutex::Locker lock(client_lock);
10925
10926 if (unmounting)
10927 return -ENOTCONN;
10928
10929 Fh *f = get_filehandle(fd);
10930 if (!f)
10931 return -EBADF;
10932 return _removexattr(f->inode, name, perms);
10933 }
10934
10935 int Client::setxattr(const char *path, const char *name, const void *value,
10936 size_t size, int flags, const UserPerm& perms)
10937 {
10938 _setxattr_maybe_wait_for_osdmap(name, value, size);
10939
10940 Mutex::Locker lock(client_lock);
10941
10942 if (unmounting)
10943 return -ENOTCONN;
10944
10945 InodeRef in;
10946 int r = Client::path_walk(path, &in, perms, true);
10947 if (r < 0)
10948 return r;
10949 return _setxattr(in, name, value, size, flags, perms);
10950 }
10951
10952 int Client::lsetxattr(const char *path, const char *name, const void *value,
10953 size_t size, int flags, const UserPerm& perms)
10954 {
10955 _setxattr_maybe_wait_for_osdmap(name, value, size);
10956
10957 Mutex::Locker lock(client_lock);
10958
10959 if (unmounting)
10960 return -ENOTCONN;
10961
10962 InodeRef in;
10963 int r = Client::path_walk(path, &in, perms, false);
10964 if (r < 0)
10965 return r;
10966 return _setxattr(in, name, value, size, flags, perms);
10967 }
10968
10969 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10970 int flags, const UserPerm& perms)
10971 {
10972 _setxattr_maybe_wait_for_osdmap(name, value, size);
10973
10974 Mutex::Locker lock(client_lock);
10975
10976 if (unmounting)
10977 return -ENOTCONN;
10978
10979 Fh *f = get_filehandle(fd);
10980 if (!f)
10981 return -EBADF;
10982 return _setxattr(f->inode, name, value, size, flags, perms);
10983 }
10984
10985 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10986 const UserPerm& perms)
10987 {
10988 int r;
10989
10990 const VXattr *vxattr = _match_vxattr(in, name);
10991 if (vxattr) {
10992 r = -ENODATA;
10993
10994 // Do a force getattr to get the latest quota before returning
10995 // a value to userspace.
10996 int flags = 0;
10997 if (vxattr->flags & VXATTR_RSTAT) {
10998 flags |= CEPH_STAT_RSTAT;
10999 }
11000 r = _getattr(in, flags, perms, true);
11001 if (r != 0) {
11002 // Error from getattr!
11003 return r;
11004 }
11005
11006 // call pointer-to-member function
11007 char buf[256];
11008 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11009 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11010 } else {
11011 r = -ENODATA;
11012 }
11013
11014 if (size != 0) {
11015 if (r > (int)size) {
11016 r = -ERANGE;
11017 } else if (r > 0) {
11018 memcpy(value, buf, r);
11019 }
11020 }
11021 goto out;
11022 }
11023
11024 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11025 r = -EOPNOTSUPP;
11026 goto out;
11027 }
11028
11029 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11030 if (r == 0) {
11031 string n(name);
11032 r = -ENODATA;
11033 if (in->xattrs.count(n)) {
11034 r = in->xattrs[n].length();
11035 if (r > 0 && size != 0) {
11036 if (size >= (unsigned)r)
11037 memcpy(value, in->xattrs[n].c_str(), r);
11038 else
11039 r = -ERANGE;
11040 }
11041 }
11042 }
11043 out:
11044 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
11045 return r;
11046 }
11047
11048 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11049 const UserPerm& perms)
11050 {
11051 if (cct->_conf->client_permissions) {
11052 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11053 if (r < 0)
11054 return r;
11055 }
11056 return _getxattr(in.get(), name, value, size, perms);
11057 }
11058
11059 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11060 size_t size, const UserPerm& perms)
11061 {
11062 Mutex::Locker lock(client_lock);
11063
11064 if (unmounting)
11065 return -ENOTCONN;
11066
11067 vinodeno_t vino = _get_vino(in);
11068
11069 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
11070 tout(cct) << "ll_getxattr" << std::endl;
11071 tout(cct) << vino.ino.val << std::endl;
11072 tout(cct) << name << std::endl;
11073
11074 if (!cct->_conf->fuse_default_permissions) {
11075 int r = xattr_permission(in, name, MAY_READ, perms);
11076 if (r < 0)
11077 return r;
11078 }
11079
11080 return _getxattr(in, name, value, size, perms);
11081 }
11082
/*
 * List xattr names into the caller's buffer as a sequence of
 * NUL-terminated strings.  Returns the total byte count needed (size == 0
 * is the "probe" convention), -ERANGE if the buffer is too small, or a
 * _getattr error.  Virtual "ceph.*" xattrs are appended after the real
 * ones; hidden vxattrs and those whose exists_cb reports absence are
 * skipped when filling the buffer.
 * NOTE(review): the size computed via _vxattrs_name_size() is assumed to
 * exclude hidden entries so it matches what the fill loop writes —
 * confirm against its definition elsewhere in the file.
 */
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  // Only force a fetch if we have never seen the xattrs for this inode.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    // First pass: r accumulates the required buffer size (r starts at 0).
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    if (size != 0) {
      if (size >= (unsigned)r) {
	// Second pass: copy each name followed by its NUL terminator.
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	if (vxattrs) {
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	r = -ERANGE;
    }
  }
  ldout(cct, 8) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11127
11128 int Client::ll_listxattr(Inode *in, char *names, size_t size,
11129 const UserPerm& perms)
11130 {
11131 Mutex::Locker lock(client_lock);
11132
11133 if (unmounting)
11134 return -ENOTCONN;
11135
11136 vinodeno_t vino = _get_vino(in);
11137
11138 ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
11139 tout(cct) << "ll_listxattr" << std::endl;
11140 tout(cct) << vino.ino.val << std::endl;
11141 tout(cct) << size << std::endl;
11142
11143 return _listxattr(in, names, size, perms);
11144 }
11145
/*
 * Issue the CEPH_MDS_OP_SETXATTR request to the MDS.  A NULL value means
 * "remove" (CEPH_XATTR_REMOVE); XATTR_CREATE / XATTR_REPLACE are mapped
 * to their wire-protocol counterparts.  Callers are expected to pass
 * size == 0 when value is NULL.
 */
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);           // xattr name travels as string2
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // The value itself is carried in the request's data payload.
  bufferlist bl;
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
11177
/*
 * Validate and set an xattr on an inode.
 *
 * Rejects snapshots (-EROFS) and any namespace the kernel client would
 * not accept (-EOPNOTSUPP).  "system.*" names are only honored when POSIX
 * ACLs are enabled, in which case ACL payloads get special handling:
 * an access ACL that is equivalent to a plain mode is translated into a
 * chmod-style setattr and the xattr itself is dropped (value = NULL).
 * Read-only virtual xattrs cannot be set.
 */
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// posix_acl_equiv_mode returns 0 when the ACL is fully expressible
	// as a plain mode: apply the mode and store no xattr at all.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // Only stx_mode is read because the mask is CEPH_SETATTR_MODE.
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    // "ceph.*" virtual xattrs that are read-only cannot be set.
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}
11238
11239 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11240 size_t size, int flags, const UserPerm& perms)
11241 {
11242 if (cct->_conf->client_permissions) {
11243 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11244 if (r < 0)
11245 return r;
11246 }
11247 return _setxattr(in.get(), name, value, size, flags, perms);
11248 }
11249
/*
 * Check whether the pool referenced by a "layout" / "layout.pool" xattr
 * value exists in the given osdmap.  For the composite "layout" form the
 * value is parsed as key=value pairs (Boost Spirit.Qi grammar in
 * keys_and_values) and the "pool" key, if any, is extracted.  The pool
 * may be given numerically or by name.
 * Returns 0 if no pool was named or the pool exists, -EINVAL on parse
 * failure, -ENOENT if the pool is unknown to this osdmap.
 */
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    // a trailing unparsed remainder also means the value was malformed
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      // Numeric form first; fall back to name lookup if it doesn't parse.
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11289
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // strip the "ceph.file."/"ceph.dir." prefix: check helper expects
    // "layout" or "layout.pool"
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    // Pool unknown to our current map: block until we have the latest
    // osdmap before issuing the request (best effort; the MDS re-checks).
    if (r == -ENOENT) {
      C_SaferCond ctx;
      objecter->wait_for_latest_osdmap(&ctx);
      ctx.wait();
    }
  }
}
11310
11311 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11312 size_t size, int flags, const UserPerm& perms)
11313 {
11314 _setxattr_maybe_wait_for_osdmap(name, value, size);
11315
11316 Mutex::Locker lock(client_lock);
11317
11318 if (unmounting)
11319 return -ENOTCONN;
11320
11321 vinodeno_t vino = _get_vino(in);
11322
11323 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
11324 tout(cct) << "ll_setxattr" << std::endl;
11325 tout(cct) << vino.ino.val << std::endl;
11326 tout(cct) << name << std::endl;
11327
11328 if (!cct->_conf->fuse_default_permissions) {
11329 int r = xattr_permission(in, name, MAY_WRITE, perms);
11330 if (r < 0)
11331 return r;
11332 }
11333 return _setxattr(in, name, value, size, flags, perms);
11334 }
11335
/*
 * Remove an xattr via CEPH_MDS_OP_RMXATTR.  Rejects snapshots (-EROFS),
 * namespaces the kernel client would not accept, and read-only virtual
 * xattrs (-EOPNOTSUPP).
 */
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);     // xattr name travels as the second filepath
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
11367
11368 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11369 {
11370 if (cct->_conf->client_permissions) {
11371 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11372 if (r < 0)
11373 return r;
11374 }
11375 return _removexattr(in.get(), name, perms);
11376 }
11377
11378 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11379 {
11380 Mutex::Locker lock(client_lock);
11381
11382 if (unmounting)
11383 return -ENOTCONN;
11384
11385 vinodeno_t vino = _get_vino(in);
11386
11387 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11388 tout(cct) << "ll_removexattr" << std::endl;
11389 tout(cct) << vino.ino.val << std::endl;
11390 tout(cct) << name << std::endl;
11391
11392 if (!cct->_conf->fuse_default_permissions) {
11393 int r = xattr_permission(in, name, MAY_WRITE, perms);
11394 if (r < 0)
11395 return r;
11396 }
11397
11398 return _removexattr(in, name, perms);
11399 }
11400
// exists_cb for "ceph.quota.*": the vxattrs are visible only when a quota
// is configured on the inode.
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable();
}
// "ceph.quota": composite view of both quota limits.
// quota.max_bytes/max_files are signed 64-bit, so %lld with a long long
// cast is the correct pairing here.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
// "ceph.quota.max_bytes"
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
// "ceph.quota.max_files"
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
11420
// exists_cb for "ceph.{file,dir}.layout*": visible only when the inode's
// layout differs from the default-constructed file_layout_t.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
// Composite "ceph.{file,dir}.layout" value.  The pool is rendered by name
// when it exists in the current osdmap, otherwise by numeric id.
// NOTE(review): snprintf returns the length that WOULD have been written;
// if the first snprintf truncates (r > size), "size - r" underflows as
// size_t and "val + r" points past the buffer — relies on callers sizing
// the buffer generously (see the 256-byte buf in _getxattr). Confirm.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
11445 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11446 {
11447 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
11448 }
11449 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11450 {
11451 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
11452 }
11453 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11454 {
11455 return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
11456 }
// "ceph.*.layout.pool": pool name when the current osdmap knows it,
// numeric id otherwise.  r is always assigned — the with_osdmap lambda
// runs synchronously on every path.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r = snprintf(val, size, "%s", o.get_pool_name(
		       in->layout.pool_id).c_str());
      else
	r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}
// "ceph.*.layout.pool_namespace"
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
11473 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11474 {
11475 return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11476 }
11477 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11478 {
11479 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
11480 }
11481 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11482 {
11483 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
11484 }
11485 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11486 {
11487 return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11488 }
11489 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11490 {
11491 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
11492 }
11493 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11494 {
11495 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
11496 }
11497 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11498 {
11499 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
11500 }
11501 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11502 {
11503 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11504 (long)in->rstat.rctime.nsec());
11505 }
11506
// Helpers for building the vxattr tables below.  They use GNU-style
// designated initializers ("field: value") matching Client::VXattr's
// member order.  Comments are placed above each macro (a // inside a
// line-continuation macro would swallow the trailing backslash).

// "ceph.<type>.<name>" and "ceph.<type>.<name>.<name2>" name builders.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only, always-present, visible vxattr backed by
// Client::_vxattrcb_<type>_<name>.
#define XATTR_NAME_CEPH(_type, _name)			\
{							\
  name: CEPH_XATTR_NAME(_type, _name),			\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true,					\
  hidden: false,					\
  exists_cb: NULL,					\
  flags: 0,						\
}
// Same, but with explicit flags (e.g. VXATTR_RSTAT to force an rstat
// refresh before reading).
#define XATTR_NAME_CEPH2(_type, _name, _flags)		\
{							\
  name: CEPH_XATTR_NAME(_type, _name),			\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true,					\
  hidden: false,					\
  exists_cb: NULL,					\
  flags: _flags,					\
}
// Writable, hidden layout sub-field; only "exists" when the inode has a
// non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)	\
{							\
  name: CEPH_XATTR_NAME2(_type, _name, _field),		\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
  readonly: false,					\
  hidden: true,						\
  exists_cb: &Client::_vxattrcb_layout_exists,		\
  flags: 0,						\
}
// Writable, hidden quota sub-field; only "exists" when a quota is set.
#define XATTR_QUOTA_FIELD(_type, _name)			\
{							\
  name: CEPH_XATTR_NAME(_type, _name),			\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: false,					\
  hidden: true,						\
  exists_cb: &Client::_vxattrcb_quota_exists,		\
  flags: 0,						\
}
11546
// Virtual xattrs exposed on directories.  Terminated by an empty-name
// sentinel; _match_vxattr and _vxattrs_calcu_name_size iterate until it.
const Client::VXattr Client::_dir_vxattrs[] = {
  // composite layout value; hidden from listxattr, sub-fields below
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  // recursive stats need a fresh rstat from the MDS (VXATTR_RSTAT)
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  // composite quota value; hidden, sub-fields below
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  { name: "" } /* Required table terminator */
};
11581
// Virtual xattrs exposed on regular files: layout only.  Terminated by an
// empty-name sentinel.
const Client::VXattr Client::_file_vxattrs[] = {
  // composite layout value; hidden from listxattr, sub-fields below
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" } /* Required table terminator */
};
11598
11599 const Client::VXattr *Client::_get_vxattrs(Inode *in)
11600 {
11601 if (in->is_dir())
11602 return _dir_vxattrs;
11603 else if (in->is_file())
11604 return _file_vxattrs;
11605 return NULL;
11606 }
11607
11608 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11609 {
11610 if (strncmp(name, "ceph.", 5) == 0) {
11611 const VXattr *vxattr = _get_vxattrs(in);
11612 if (vxattr) {
11613 while (!vxattr->name.empty()) {
11614 if (vxattr->name == name)
11615 return vxattr;
11616 vxattr++;
11617 }
11618 }
11619 }
11620 return NULL;
11621 }
11622
11623 size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11624 {
11625 size_t len = 0;
11626 while (!vxattr->name.empty()) {
11627 if (!vxattr->hidden)
11628 len += vxattr->name.length() + 1;
11629 vxattr++;
11630 }
11631 return len;
11632 }
11633
// Read the target of a symlink inode into buf (up to buflen bytes).
// Touches all dentries referencing the inode to keep them warm in the
// LRU before reading the link body.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  set<Dentry*>::iterator dn = in->dn_set.begin();
  while (dn != in->dn_set.end()) {
    touch_dn(*dn);
    ++dn;
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
11657
/*
 * Create a filesystem node (device, fifo, socket, ...) named 'name' in
 * directory 'dir' via CEPH_MDS_OP_MKNOD.  On success *inp receives the
 * new inode.  Fails with -ENAMETOOLONG, -EROFS (snapshot dir), -EDQUOT
 * (file-count quota), or an ACL/dentry/MDS error.  The goto-fail path
 * releases the MetaRequest allocated below.
 */
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Default ACLs from the parent may both adjust 'mode' and produce
  // initial xattrs that ride along in the request payload.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
11711
// Public ll_ mknod: permission check (unless FUSE default_permissions),
// create the node, fill *attr and hand an ll reference to the caller via
// *out on success.
// NOTE(review): attr->st_ino is logged/traced even when r != 0, in which
// case fill_stat() did not run — this reads whatever the caller left in
// *attr.  Confirm callers zero-initialize it.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // caller owns an ll reference on *out
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	    << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11748
// statx-flavoured ll_ mknod; like ll_mknod but fills a ceph_statx limited
// to the caps derived from (flags, want).
// NOTE(review): stx->stx_ino is logged/traced even when r != 0, in which
// case fill_statx() did not run — confirm callers zero-initialize *stx.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());  // caller owns an ll reference on *out
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	    << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11787
/**
 * Send CEPH_MDS_OP_CREATE to the MDS: create (and optionally open) a regular
 * file named @name under directory @dir.
 *
 * @param dir parent directory inode
 * @param name new entry name (rejected with -ENAMETOOLONG past NAME_MAX)
 * @param flags open(2)-style flags; O_CREAT is OR'd in below
 * @param mode permission bits; S_IFREG is OR'd in below
 * @param inp [out] the resulting inode
 * @param fhp if non-NULL, also take an open ref and return a new Fh here
 * @param stripe_unit,stripe_count,object_size optional file layout (0 = default)
 * @param data_pool optional data pool name; resolved to a pool id
 * @param created [out] filled by make_request(); true if the MDS created it
 * @param perms caller credentials
 * @return 0 on success, negative errno on failure
 */
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
                    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
                    int object_size, const char *data_pool, bool *created,
                    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  // resolve an explicit data pool name to its id, if one was given
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // wire format can only carry 32 bits of pool id
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  bufferlist xattrs_bl;
  // may adjust mode and emit ACL xattrs inherited from the parent's default ACL
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
    << " layout " << stripe_unit
    << ' ' << stripe_count
    << ' ' << object_size
    <<") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
11880
11881
/**
 * Create a directory named @name under @dir.
 *
 * If @dir is the magic snapdir (snapid == CEPH_SNAPDIR) this issues MKSNAP
 * instead of MKDIR, i.e. mkdir in .snap creates a snapshot of the parent.
 *
 * @param dir parent directory inode
 * @param name new directory (or snapshot) name
 * @param mode permission bits; S_IFDIR is OR'd in below
 * @param perm caller credentials
 * @param inp [out] the resulting inode
 * @return 0 on success, negative errno on failure
 */
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
                   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
    << mode << dec << ", uid " << perm.uid()
    << ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots themselves are read-only; only the snapdir accepts mkdir
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
                                     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  bufferlist xattrs_bl;
  // may adjust mode and emit ACL xattrs inherited from the parent's default ACL
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
11937
11938 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
11939 struct stat *attr, Inode **out, const UserPerm& perm)
11940 {
11941 Mutex::Locker lock(client_lock);
11942
11943 if (unmounting)
11944 return -ENOTCONN;
11945
11946 vinodeno_t vparent = _get_vino(parent);
11947
11948 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
11949 tout(cct) << "ll_mkdir" << std::endl;
11950 tout(cct) << vparent.ino.val << std::endl;
11951 tout(cct) << name << std::endl;
11952 tout(cct) << mode << std::endl;
11953
11954 if (!cct->_conf->fuse_default_permissions) {
11955 int r = may_create(parent, perm);
11956 if (r < 0)
11957 return r;
11958 }
11959
11960 InodeRef in;
11961 int r = _mkdir(parent, name, mode, perm, &in);
11962 if (r == 0) {
11963 fill_stat(in, attr);
11964 _ll_get(in.get());
11965 }
11966 tout(cct) << attr->st_ino << std::endl;
11967 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
11968 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11969 *out = in.get();
11970 return r;
11971 }
11972
11973 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
11974 struct ceph_statx *stx, unsigned want, unsigned flags,
11975 const UserPerm& perms)
11976 {
11977 Mutex::Locker lock(client_lock);
11978
11979 if (unmounting)
11980 return -ENOTCONN;
11981
11982 vinodeno_t vparent = _get_vino(parent);
11983
11984 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
11985 tout(cct) << "ll_mkdirx" << std::endl;
11986 tout(cct) << vparent.ino.val << std::endl;
11987 tout(cct) << name << std::endl;
11988 tout(cct) << mode << std::endl;
11989
11990 if (!cct->_conf->fuse_default_permissions) {
11991 int r = may_create(parent, perms);
11992 if (r < 0)
11993 return r;
11994 }
11995
11996 InodeRef in;
11997 int r = _mkdir(parent, name, mode, perms, &in);
11998 if (r == 0) {
11999 fill_statx(in, statx_to_mask(flags, want), stx);
12000 _ll_get(in.get());
12001 } else {
12002 stx->stx_ino = 0;
12003 stx->stx_mask = 0;
12004 }
12005 tout(cct) << stx->stx_ino << std::endl;
12006 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12007 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12008 *out = in.get();
12009 return r;
12010 }
12011
/**
 * Create a symlink named @name under @dir pointing at @target.
 *
 * @param dir parent directory inode
 * @param name new link name (rejected with -ENAMETOOLONG past NAME_MAX)
 * @param target link target string (sent as string2 in the request)
 * @param perms caller credentials
 * @param inp [out] the resulting inode
 * @return 0 on success, negative errno on failure
 */
int Client::_symlink(Inode *dir, const char *name, const char *target,
                     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
    << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
    << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
12057
12058 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12059 struct stat *attr, Inode **out, const UserPerm& perms)
12060 {
12061 Mutex::Locker lock(client_lock);
12062
12063 if (unmounting)
12064 return -ENOTCONN;
12065
12066 vinodeno_t vparent = _get_vino(parent);
12067
12068 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12069 << dendl;
12070 tout(cct) << "ll_symlink" << std::endl;
12071 tout(cct) << vparent.ino.val << std::endl;
12072 tout(cct) << name << std::endl;
12073 tout(cct) << value << std::endl;
12074
12075 if (!cct->_conf->fuse_default_permissions) {
12076 int r = may_create(parent, perms);
12077 if (r < 0)
12078 return r;
12079 }
12080
12081 InodeRef in;
12082 int r = _symlink(parent, name, value, perms, &in);
12083 if (r == 0) {
12084 fill_stat(in, attr);
12085 _ll_get(in.get());
12086 }
12087 tout(cct) << attr->st_ino << std::endl;
12088 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12089 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12090 *out = in.get();
12091 return r;
12092 }
12093
12094 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12095 Inode **out, struct ceph_statx *stx, unsigned want,
12096 unsigned flags, const UserPerm& perms)
12097 {
12098 Mutex::Locker lock(client_lock);
12099
12100 if (unmounting)
12101 return -ENOTCONN;
12102
12103 vinodeno_t vparent = _get_vino(parent);
12104
12105 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12106 << dendl;
12107 tout(cct) << "ll_symlinkx" << std::endl;
12108 tout(cct) << vparent.ino.val << std::endl;
12109 tout(cct) << name << std::endl;
12110 tout(cct) << value << std::endl;
12111
12112 if (!cct->_conf->fuse_default_permissions) {
12113 int r = may_create(parent, perms);
12114 if (r < 0)
12115 return r;
12116 }
12117
12118 InodeRef in;
12119 int r = _symlink(parent, name, value, perms, &in);
12120 if (r == 0) {
12121 fill_statx(in, statx_to_mask(flags, want), stx);
12122 _ll_get(in.get());
12123 }
12124 tout(cct) << stx->stx_ino << std::endl;
12125 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12126 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12127 *out = in.get();
12128 return r;
12129 }
12130
/**
 * Unlink entry @name from directory @dir.
 *
 * Looks up the target inode first so delegations can be broken and the
 * MDS can be told which caps to drop on it.
 *
 * @param dir parent directory inode
 * @param name entry to remove
 * @param perm caller credentials
 * @return 0 on success, negative errno on failure
 */
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
    << " uid " << perm.uid() << " gid " << perm.gid()
    << ")" << dendl;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // find the inode being unlinked; it rides along as the "other" inode
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  // recall any delegations handed out on the target before removing it
  in->break_all_delegs();
  req->set_other_inode(in);
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
12180
12181 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12182 {
12183 Mutex::Locker lock(client_lock);
12184
12185 if (unmounting)
12186 return -ENOTCONN;
12187
12188 vinodeno_t vino = _get_vino(in);
12189
12190 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12191 tout(cct) << "ll_unlink" << std::endl;
12192 tout(cct) << vino.ino.val << std::endl;
12193 tout(cct) << name << std::endl;
12194
12195 if (!cct->_conf->fuse_default_permissions) {
12196 int r = may_delete(in, name, perm);
12197 if (r < 0)
12198 return r;
12199 }
12200 return _unlink(in, name, perm);
12201 }
12202
/**
 * Remove directory @name under @dir.
 *
 * If @dir is the snapdir (snapid == CEPH_SNAPDIR), this issues RMSNAP
 * instead of RMDIR, deleting the snapshot of that name.
 *
 * @param dir parent directory inode
 * @param name directory (or snapshot) to remove
 * @param perms caller credentials
 * @return 0 on success, negative errno on failure
 */
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
    << perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // snapshots themselves are read-only; only the snapdir accepts rmdir
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    // RMSNAP: the request does not take the dentry, so pin it ourselves
    // until we've unlinked it from the cache below
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR) {
    req->set_inode(dir);
    req->set_other_inode(in.get());
  } else {
    // rmsnap reply carries no trace dentry, so invalidate the cached
    // dentry manually and drop the pin taken above
    unlink(de, true, true);
    de->put();
    req->set_other_inode(in.get());
  }

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
12256
12257 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12258 {
12259 Mutex::Locker lock(client_lock);
12260
12261 if (unmounting)
12262 return -ENOTCONN;
12263
12264 vinodeno_t vino = _get_vino(in);
12265
12266 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12267 tout(cct) << "ll_rmdir" << std::endl;
12268 tout(cct) << vino.ino.val << std::endl;
12269 tout(cct) << name << std::endl;
12270
12271 if (!cct->_conf->fuse_default_permissions) {
12272 int r = may_delete(in, name, perms);
12273 if (r < 0)
12274 return r;
12275 }
12276
12277 return _rmdir(in, name, perms);
12278 }
12279
/**
 * Rename @fromname in @fromdir to @toname in @todir.
 *
 * Refuses cross-snapshot renames (-EXDEV) and renames that would cross a
 * quota-root boundary (-EXDEV), since quota accounting cannot follow the
 * file. Renaming within a snapdir becomes RENAMESNAP; any other rename
 * involving a snapshot is -EROFS.
 *
 * @return 0 on success, negative errno on failure
 */
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
    << todir->ino << " " << toname
    << " uid " << perm.uid() << " gid " << perm.gid() << ")"
    << dendl;

  // source and destination must live in the same snapshot context
  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;   // renaming a snapshot within .snap
    else
      return -EROFS;
  }
  if (fromdir != todir) {
    // disallow renames across quota roots
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    // look up the inode being moved so its delegations can be broken
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // a pre-existing destination entry (if any) will be clobbered; break
    // its delegations too. -ENOENT just means nothing is in the way.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
12384
12385 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12386 const char *newname, const UserPerm& perm)
12387 {
12388 Mutex::Locker lock(client_lock);
12389
12390 if (unmounting)
12391 return -ENOTCONN;
12392
12393 vinodeno_t vparent = _get_vino(parent);
12394 vinodeno_t vnewparent = _get_vino(newparent);
12395
12396 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12397 << vnewparent << " " << newname << dendl;
12398 tout(cct) << "ll_rename" << std::endl;
12399 tout(cct) << vparent.ino.val << std::endl;
12400 tout(cct) << name << std::endl;
12401 tout(cct) << vnewparent.ino.val << std::endl;
12402 tout(cct) << newname << std::endl;
12403
12404 if (!cct->_conf->fuse_default_permissions) {
12405 int r = may_delete(parent, name, perm);
12406 if (r < 0)
12407 return r;
12408 r = may_delete(newparent, newname, perm);
12409 if (r < 0 && r != -ENOENT)
12410 return r;
12411 }
12412
12413 return _rename(parent, name, newparent, newname, perm);
12414 }
12415
/**
 * Create hard link @newname in @dir pointing to existing inode @in.
 *
 * @param in existing inode to link to
 * @param dir directory to create the new name in
 * @param newname new entry name (rejected with -ENAMETOOLONG past NAME_MAX)
 * @param perm caller credentials
 * @param inp [out] the (linked) inode
 * @return 0 on success, negative errno on failure
 */
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
    << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // neither end may be in a snapshot
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // recall any delegations on the target before changing its link count
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
12460
12461 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12462 const UserPerm& perm)
12463 {
12464 Mutex::Locker lock(client_lock);
12465
12466 if (unmounting)
12467 return -ENOTCONN;
12468
12469 vinodeno_t vino = _get_vino(in);
12470 vinodeno_t vnewparent = _get_vino(newparent);
12471
12472 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12473 newname << dendl;
12474 tout(cct) << "ll_link" << std::endl;
12475 tout(cct) << vino.ino.val << std::endl;
12476 tout(cct) << vnewparent << std::endl;
12477 tout(cct) << newname << std::endl;
12478
12479 int r = 0;
12480 InodeRef target;
12481
12482 if (!cct->_conf->fuse_default_permissions) {
12483 if (S_ISDIR(in->mode))
12484 return -EPERM;
12485
12486 r = may_hardlink(in, perm);
12487 if (r < 0)
12488 return r;
12489
12490 r = may_create(newparent, perm);
12491 if (r < 0)
12492 return r;
12493 }
12494
12495 return _link(in, newparent, newname, perm, &target);
12496 }
12497
12498 int Client::ll_num_osds(void)
12499 {
12500 Mutex::Locker lock(client_lock);
12501 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12502 }
12503
12504 int Client::ll_osdaddr(int osd, uint32_t *addr)
12505 {
12506 Mutex::Locker lock(client_lock);
12507
12508 entity_addr_t g;
12509 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12510 if (!o.exists(osd))
12511 return false;
12512 g = o.get_addr(osd);
12513 return true;
12514 });
12515 if (!exists)
12516 return -1;
12517 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12518 *addr = ntohl(nb_addr);
12519 return 0;
12520 }
12521
12522 uint32_t Client::ll_stripe_unit(Inode *in)
12523 {
12524 Mutex::Locker lock(client_lock);
12525 return in->layout.stripe_unit;
12526 }
12527
12528 uint64_t Client::ll_snap_seq(Inode *in)
12529 {
12530 Mutex::Locker lock(client_lock);
12531 return in->snaprealm->seq;
12532 }
12533
12534 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12535 {
12536 Mutex::Locker lock(client_lock);
12537 *layout = in->layout;
12538 return 0;
12539 }
12540
12541 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12542 {
12543 return ll_file_layout(fh->inode.get(), layout);
12544 }
12545
12546 /* Currently we cannot take advantage of redundancy in reads, since we
12547 would have to go through all possible placement groups (a
12548 potentially quite large number determined by a hash), and use CRUSH
12549 to calculate the appropriate set of OSDs for each placement group,
12550 then index into that. An array with one entry per OSD is much more
12551 tractable and works for demonstration purposes. */
12552
12553 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12554 file_layout_t* layout)
12555 {
12556 Mutex::Locker lock(client_lock);
12557
12558 inodeno_t ino = in->ino;
12559 uint32_t object_size = layout->object_size;
12560 uint32_t su = layout->stripe_unit;
12561 uint32_t stripe_count = layout->stripe_count;
12562 uint64_t stripes_per_object = object_size / su;
12563
12564 uint64_t stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12565 uint64_t stripepos = blockno % stripe_count; // which object in the object set (X)
12566 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12567 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12568
12569 object_t oid = file_object_t(ino, objectno);
12570 return objecter->with_osdmap([&](const OSDMap& o) {
12571 ceph_object_layout olayout =
12572 o.file_to_object_layout(oid, *layout);
12573 pg_t pg = (pg_t)olayout.ol_pgid;
12574 vector<int> osds;
12575 int primary;
12576 o.pg_to_acting_osds(pg, &osds, &primary);
12577 return primary;
12578 });
12579 }
12580
12581 /* Return the offset of the block, internal to the object */
12582
12583 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12584 {
12585 Mutex::Locker lock(client_lock);
12586 file_layout_t *layout=&(in->layout);
12587 uint32_t object_size = layout->object_size;
12588 uint32_t su = layout->stripe_unit;
12589 uint64_t stripes_per_object = object_size / su;
12590
12591 return (blockno % stripes_per_object) * su;
12592 }
12593
12594 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12595 const UserPerm& perms)
12596 {
12597 Mutex::Locker lock(client_lock);
12598
12599 if (unmounting)
12600 return -ENOTCONN;
12601
12602 vinodeno_t vino = _get_vino(in);
12603
12604 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12605 tout(cct) << "ll_opendir" << std::endl;
12606 tout(cct) << vino.ino.val << std::endl;
12607
12608 if (!cct->_conf->fuse_default_permissions) {
12609 int r = may_open(in, flags, perms);
12610 if (r < 0)
12611 return r;
12612 }
12613
12614 int r = _opendir(in, dirpp, perms);
12615 tout(cct) << (unsigned long)*dirpp << std::endl;
12616
12617 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12618 << dendl;
12619 return r;
12620 }
12621
12622 int Client::ll_releasedir(dir_result_t *dirp)
12623 {
12624 Mutex::Locker lock(client_lock);
12625 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12626 tout(cct) << "ll_releasedir" << std::endl;
12627 tout(cct) << (unsigned long)dirp << std::endl;
12628
12629 if (unmounting)
12630 return -ENOTCONN;
12631
12632 _closedir(dirp);
12633 return 0;
12634 }
12635
12636 int Client::ll_fsyncdir(dir_result_t *dirp)
12637 {
12638 Mutex::Locker lock(client_lock);
12639 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12640 tout(cct) << "ll_fsyncdir" << std::endl;
12641 tout(cct) << (unsigned long)dirp << std::endl;
12642
12643 if (unmounting)
12644 return -ENOTCONN;
12645
12646 return _fsync(dirp->inode.get(), false);
12647 }
12648
12649 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12650 {
12651 assert(!(flags & O_CREAT));
12652
12653 Mutex::Locker lock(client_lock);
12654
12655 if (unmounting)
12656 return -ENOTCONN;
12657
12658 vinodeno_t vino = _get_vino(in);
12659
12660 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
12661 tout(cct) << "ll_open" << std::endl;
12662 tout(cct) << vino.ino.val << std::endl;
12663 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12664
12665 int r;
12666 if (!cct->_conf->fuse_default_permissions) {
12667 r = may_open(in, flags, perms);
12668 if (r < 0)
12669 goto out;
12670 }
12671
12672 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
12673
12674 out:
12675 Fh *fhptr = fhp ? *fhp : NULL;
12676 if (fhptr) {
12677 ll_unclosed_fh_set.insert(fhptr);
12678 }
12679 tout(cct) << (unsigned long)fhptr << std::endl;
12680 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
12681 " = " << r << " (" << fhptr << ")" << dendl;
12682 return r;
12683 }
12684
/**
 * Shared implementation behind ll_create/ll_createx: look up @name under
 * @parent and, if absent and O_CREAT is set, create it; then open it.
 *
 * @param parent parent directory inode
 * @param name entry name
 * @param mode creation mode (when creating)
 * @param flags open(2)-style flags; O_CREAT/O_EXCL honored
 * @param in [out] the looked-up or created inode
 * @param caps attribute mask to request on lookup
 * @param fhp [out] open file handle; set to NULL first, filled on open
 * @param perms caller credentials
 * @return 0 on success, negative errno on failure
 */
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
                       int flags, InodeRef *in, int caps, Fh **fhp,
                       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing entry is an error
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create may also perform the open (fhp non-NULL) in one round trip
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // pre-existing file: the MDS did not check open permission for us
    if (!cct->_conf->fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

 out:
  if (*fhp) {
    // remember the handle so a leaked fh can be reclaimed at unmount
    ll_unclosed_fh_set.insert(*fhp);
  }

  // resolve the ino (possibly a faked one) purely for trace/log output
  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
12766
12767 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12768 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12769 const UserPerm& perms)
12770 {
12771 Mutex::Locker lock(client_lock);
12772 InodeRef in;
12773
12774 if (unmounting)
12775 return -ENOTCONN;
12776
12777 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12778 fhp, perms);
12779 if (r >= 0) {
12780 assert(in);
12781
12782 // passing an Inode in outp requires an additional ref
12783 if (outp) {
12784 _ll_get(in.get());
12785 *outp = in.get();
12786 }
12787 fill_stat(in, attr);
12788 } else {
12789 attr->st_ino = 0;
12790 }
12791
12792 return r;
12793 }
12794
12795 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12796 int oflags, Inode **outp, Fh **fhp,
12797 struct ceph_statx *stx, unsigned want, unsigned lflags,
12798 const UserPerm& perms)
12799 {
12800 unsigned caps = statx_to_mask(lflags, want);
12801 Mutex::Locker lock(client_lock);
12802 InodeRef in;
12803
12804 if (unmounting)
12805 return -ENOTCONN;
12806
12807 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12808 if (r >= 0) {
12809 assert(in);
12810
12811 // passing an Inode in outp requires an additional ref
12812 if (outp) {
12813 _ll_get(in.get());
12814 *outp = in.get();
12815 }
12816 fill_statx(in, caps, stx);
12817 } else {
12818 stx->stx_ino = 0;
12819 stx->stx_mask = 0;
12820 }
12821
12822 return r;
12823 }
12824
12825 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12826 {
12827 Mutex::Locker lock(client_lock);
12828 tout(cct) << "ll_lseek" << std::endl;
12829 tout(cct) << offset << std::endl;
12830 tout(cct) << whence << std::endl;
12831
12832 if (unmounting)
12833 return -ENOTCONN;
12834
12835 return _lseek(fh, offset, whence);
12836 }
12837
12838 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12839 {
12840 Mutex::Locker lock(client_lock);
12841 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12842 tout(cct) << "ll_read" << std::endl;
12843 tout(cct) << (unsigned long)fh << std::endl;
12844 tout(cct) << off << std::endl;
12845 tout(cct) << len << std::endl;
12846
12847 if (unmounting)
12848 return -ENOTCONN;
12849
12850 return _read(fh, off, len, bl);
12851 }
12852
/**
 * Read up to @length bytes at @offset from a single RADOS object
 * (block @blockid of @in's file), bypassing the page/object cache.
 *
 * @param in file inode (used only for its vino)
 * @param blockid object index within the file
 * @param buf destination buffer, must hold @length bytes
 * @param offset byte offset within the object
 * @param length bytes to read
 * @param layout file layout (supplies the data pool)
 * @return bytes read on success, negative errno on failure
 */
int Client::ll_read_block(Inode *in, uint64_t blockid,
                          char *buf,
                          uint64_t offset,
                          uint64_t length,
                          file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // drop client_lock while blocked on the OSD reply so other client
  // operations can make progress; retake it before touching client state
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    // copy out whatever the OSD returned and report its actual size
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
12889
/* It appears that the OSD doesn't return success unless the entire
   buffer was written, return the write length on success. */

/**
 * Write one RADOS object block directly, bypassing striping and caching.
 * Returns `length` on success, negative errno on failure.
 *
 * NOTE(review): the `sync` parameter is currently ignored — the
 * `if (true || sync)` below forces every write down the stable
 * (wait-for-commit) path; the unstable barrier path is stubbed out.
 */
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  Mutex flock("Client::ll_write_block flock");
  vinodeno_t vino = ll_get_vino(in);
  Cond cond;
  bool done;
  int r = 0;
  Context *onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe = new C_SafeCond(&flock, &cond, &done, &r);
    done = false;
  } else {
    /* if write is unstable, we just place a barrier for
     * future commits to wait on */
    /*onsafe = new C_Block_Sync(this, vino.ino,
			      barrier_interval(offset, offset + length), &r);
    */
    done = true;
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  bufferptr bp;
  if (length > 0) bp = buffer::copy(buf, length);
  bufferlist bl;
  bl.push_back(bp);

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();
  if (unmounting) {
    client_lock.Unlock();
    delete onsafe;  // never submitted, so we own it
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe);

  client_lock.Unlock();
  // done==false only on the stable path; wait for the commit callback.
  if (!done /* also !sync */) {
    flock.Lock();
    while (! done)
      cond.Wait(flock);
    flock.Unlock();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
12965
/**
 * Commit previously-written unstable blocks in [offset, offset+length).
 *
 * The barrier-based implementation is currently disabled (kept below for
 * reference); this is a no-op that always reports success.
 */
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  Mutex::Locker lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
12991
12992 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12993 {
12994 Mutex::Locker lock(client_lock);
12995 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12996 "~" << len << dendl;
12997 tout(cct) << "ll_write" << std::endl;
12998 tout(cct) << (unsigned long)fh << std::endl;
12999 tout(cct) << off << std::endl;
13000 tout(cct) << len << std::endl;
13001
13002 if (unmounting)
13003 return -ENOTCONN;
13004
13005 int r = _write(fh, off, len, data, NULL, 0);
13006 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13007 << dendl;
13008 return r;
13009 }
13010
13011 int Client::ll_flush(Fh *fh)
13012 {
13013 Mutex::Locker lock(client_lock);
13014 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13015 tout(cct) << "ll_flush" << std::endl;
13016 tout(cct) << (unsigned long)fh << std::endl;
13017
13018 if (unmounting)
13019 return -ENOTCONN;
13020
13021 return _flush(fh);
13022 }
13023
13024 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13025 {
13026 Mutex::Locker lock(client_lock);
13027 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13028 tout(cct) << "ll_fsync" << std::endl;
13029 tout(cct) << (unsigned long)fh << std::endl;
13030
13031 if (unmounting)
13032 return -ENOTCONN;
13033
13034 int r = _fsync(fh, syncdataonly);
13035 if (r) {
13036 // If we're returning an error, clear it from the FH
13037 fh->take_async_err();
13038 }
13039 return r;
13040 }
13041
13042 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13043 {
13044 Mutex::Locker lock(client_lock);
13045 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13046 tout(cct) << "ll_sync_inode" << std::endl;
13047 tout(cct) << (unsigned long)in << std::endl;
13048
13049 if (unmounting)
13050 return -ENOTCONN;
13051
13052 return _fsync(in, syncdataonly);
13053 }
13054
#ifdef FALLOC_FL_PUNCH_HOLE

/**
 * fallocate(2) core: punch a hole or (pre)extend a file.
 *
 * Supported mode bits are FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE;
 * matching Linux semantics, PUNCH_HOLE requires KEEP_SIZE.  Called with
 * client_lock held.  Returns 0 or negative errno.
 */
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocating into a full pool would stall forever; punching a hole
  // frees space, so it is still permitted.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Only a plain extending allocate (no KEEP_SIZE/PUNCH_HOLE) grows the
  // file, so only that case is checked against quota.
  uint64_t size = offset + length;
  std::list<InodeRef> quota_roots;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms, &quota_roots)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Data is still inline and we hold the buffer cap: rewrite the
      // inline blob locally, zero-filling the punched range.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Otherwise push any inline data out to RADOS first, then zero
      // the range on the OSDs.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline = new C_SafeCond(&uninline_flock,
                                    &uninline_cond,
                                    &uninline_done,
                                    &uninline_ret);
        uninline_data(in, onuninline);
      }

      Mutex flock("Client::_punch_hole flock");
      Cond cond;
      bool done = false;
      Context *onfinish = new C_SafeCond(&flock, &cond, &done);

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Wait for the zero to commit without holding client_lock.
      client_lock.Unlock();
      flock.Lock();
      while (!done)
        cond.Wait(flock);
      flock.Unlock();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocate past EOF: just grow the metadata size.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, quota_roots)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (onuninline) {
    // Wait for the uninline to finish (again without client_lock).
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else uninlined first — still a success.
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

// Platform lacks fallocate(2) mode flags; report unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
13200
13201
13202 int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
13203 {
13204 Mutex::Locker lock(client_lock);
13205 ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
13206 tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
13207 tout(cct) << (unsigned long)fh << std::endl;
13208
13209 if (unmounting)
13210 return -ENOTCONN;
13211
13212 return _fallocate(fh, mode, offset, length);
13213 }
13214
13215 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13216 {
13217 Mutex::Locker lock(client_lock);
13218 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
13219
13220 if (unmounting)
13221 return -ENOTCONN;
13222
13223 Fh *fh = get_filehandle(fd);
13224 if (!fh)
13225 return -EBADF;
13226 #if defined(__linux__) && defined(O_PATH)
13227 if (fh->flags & O_PATH)
13228 return -EBADF;
13229 #endif
13230 return _fallocate(fh, mode, offset, length);
13231 }
13232
13233 int Client::ll_release(Fh *fh)
13234 {
13235 Mutex::Locker lock(client_lock);
13236
13237 if (unmounting)
13238 return -ENOTCONN;
13239
13240 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
13241 dendl;
13242 tout(cct) << "ll_release (fh)" << std::endl;
13243 tout(cct) << (unsigned long)fh << std::endl;
13244
13245 if (ll_unclosed_fh_set.count(fh))
13246 ll_unclosed_fh_set.erase(fh);
13247 return _release_fh(fh);
13248 }
13249
13250 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13251 {
13252 Mutex::Locker lock(client_lock);
13253
13254 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13255 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13256
13257 if (unmounting)
13258 return -ENOTCONN;
13259
13260 return _getlk(fh, fl, owner);
13261 }
13262
13263 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13264 {
13265 Mutex::Locker lock(client_lock);
13266
13267 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
13268 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13269
13270 if (unmounting)
13271 return -ENOTCONN;
13272
13273 return _setlk(fh, fl, owner, sleep);
13274 }
13275
13276 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13277 {
13278 Mutex::Locker lock(client_lock);
13279
13280 ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
13281 tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
13282
13283 if (unmounting)
13284 return -ENOTCONN;
13285
13286 return _flock(fh, cmd, owner);
13287 }
13288
/**
 * Set how long the client holds delegations before breaking them.
 *
 * @param timeout seconds; must be shorter than the MDS session autoclose
 *                interval, or -EINVAL is returned
 * @return 0 on success, -EINVAL if the timeout is too long
 */
int Client::set_deleg_timeout(uint32_t timeout)
{
  Mutex::Locker lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
13303
13304 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13305 {
13306 int ret = -EINVAL;
13307
13308 Mutex::Locker lock(client_lock);
13309
13310 if (!mounted)
13311 return -ENOTCONN;
13312
13313 Inode *inode = fh->inode.get();
13314
13315 switch(cmd) {
13316 case CEPH_DELEGATION_NONE:
13317 inode->unset_deleg(fh);
13318 ret = 0;
13319 break;
13320 default:
13321 try {
13322 ret = inode->set_deleg(fh, cmd, cb, priv);
13323 } catch (std::bad_alloc) {
13324 ret = -ENOMEM;
13325 }
13326 break;
13327 }
13328 return ret;
13329 }
13330
// Finisher context that cancels an in-flight file-lock request.  Holds a
// ref on the MetaRequest so it stays valid until the interrupt runs.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    // Runs on the interrupt finisher thread, so take client_lock here.
    Mutex::Locker l(client->client_lock);
    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the ctor
  }
};
13346
// Asynchronously interrupt a blocked SETFILELOCK request (d is the
// MetaRequest*); the actual cancellation runs on the finisher thread.
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
  tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13354
13355 // =========================================
13356 // layout
13357
13358 // expose file layouts
13359
13360 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13361 const UserPerm& perms)
13362 {
13363 Mutex::Locker lock(client_lock);
13364
13365 if (unmounting)
13366 return -ENOTCONN;
13367
13368 filepath path(relpath);
13369 InodeRef in;
13370 int r = path_walk(path, &in, perms);
13371 if (r < 0)
13372 return r;
13373
13374 *lp = in->layout;
13375
13376 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
13377 return 0;
13378 }
13379
13380 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13381 {
13382 Mutex::Locker lock(client_lock);
13383
13384 if (unmounting)
13385 return -ENOTCONN;
13386
13387 Fh *f = get_filehandle(fd);
13388 if (!f)
13389 return -EBADF;
13390 Inode *in = f->inode.get();
13391
13392 *lp = in->layout;
13393
13394 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
13395 return 0;
13396 }
13397
// Return the id of the filesystem's default data pool (the first data
// pool in the MDS map), or -ENOTCONN while unmounting.
int64_t Client::get_default_pool_id()
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}
13408
13409 // expose osdmap
13410
13411 int64_t Client::get_pool_id(const char *pool_name)
13412 {
13413 Mutex::Locker lock(client_lock);
13414
13415 if (unmounting)
13416 return -ENOTCONN;
13417
13418 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13419 pool_name);
13420 }
13421
13422 string Client::get_pool_name(int64_t pool)
13423 {
13424 Mutex::Locker lock(client_lock);
13425
13426 if (unmounting)
13427 return string();
13428
13429 return objecter->with_osdmap([pool](const OSDMap& o) {
13430 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13431 });
13432 }
13433
13434 int Client::get_pool_replication(int64_t pool)
13435 {
13436 Mutex::Locker lock(client_lock);
13437
13438 if (unmounting)
13439 return -ENOTCONN;
13440
13441 return objecter->with_osdmap([pool](const OSDMap& o) {
13442 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13443 });
13444 }
13445
/**
 * Find the acting OSDs for the object containing file offset `off`, and
 * optionally the number of bytes remaining in that stripe unit.
 *
 * @param fd   open file descriptor
 * @param off  file offset to locate
 * @param len  out (optional): bytes from off to the end of the stripe unit
 * @param osds out: acting OSD ids for the containing placement group
 * @return 0 on success, negative errno on failure
 */
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map exactly one byte so we get a single extent back.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13491
13492 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13493 {
13494 Mutex::Locker lock(client_lock);
13495
13496 if (unmounting)
13497 return -ENOTCONN;
13498
13499 if (id < 0)
13500 return -EINVAL;
13501 return objecter->with_osdmap([&](const OSDMap& o) {
13502 return o.crush->get_full_location_ordered(id, path);
13503 });
13504 }
13505
13506 int Client::get_file_stripe_address(int fd, loff_t offset,
13507 vector<entity_addr_t>& address)
13508 {
13509 Mutex::Locker lock(client_lock);
13510
13511 if (unmounting)
13512 return -ENOTCONN;
13513
13514 Fh *f = get_filehandle(fd);
13515 if (!f)
13516 return -EBADF;
13517 Inode *in = f->inode.get();
13518
13519 // which object?
13520 vector<ObjectExtent> extents;
13521 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13522 in->truncate_size, extents);
13523 assert(extents.size() == 1);
13524
13525 // now we have the object and its 'layout'
13526 return objecter->with_osdmap([&](const OSDMap& o) {
13527 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13528 vector<int> osds;
13529 o.pg_to_acting_osds(pg, osds);
13530 if (osds.empty())
13531 return -EINVAL;
13532 for (unsigned i = 0; i < osds.size(); i++) {
13533 entity_addr_t addr = o.get_addr(osds[i]);
13534 address.push_back(addr);
13535 }
13536 return 0;
13537 });
13538 }
13539
13540 int Client::get_osd_addr(int osd, entity_addr_t& addr)
13541 {
13542 Mutex::Locker lock(client_lock);
13543
13544 if (unmounting)
13545 return -ENOTCONN;
13546
13547 return objecter->with_osdmap([&](const OSDMap& o) {
13548 if (!o.exists(osd))
13549 return -ENOENT;
13550
13551 addr = o.get_addr(osd);
13552 return 0;
13553 });
13554 }
13555
13556 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13557 loff_t length, loff_t offset)
13558 {
13559 Mutex::Locker lock(client_lock);
13560
13561 if (unmounting)
13562 return -ENOTCONN;
13563
13564 Fh *f = get_filehandle(fd);
13565 if (!f)
13566 return -EBADF;
13567 Inode *in = f->inode.get();
13568
13569 // map to a list of extents
13570 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13571
13572 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13573 return 0;
13574 }
13575
13576
13577 /* find an osd with the same ip. -ENXIO if none. */
13578 int Client::get_local_osd()
13579 {
13580 Mutex::Locker lock(client_lock);
13581
13582 if (unmounting)
13583 return -ENOTCONN;
13584
13585 objecter->with_osdmap([this](const OSDMap& o) {
13586 if (o.get_epoch() != local_osd_epoch) {
13587 local_osd = o.find_osd_on_ip(messenger->get_myaddr());
13588 local_osd_epoch = o.get_epoch();
13589 }
13590 });
13591 return local_osd;
13592 }
13593
13594
13595
13596
13597
13598
13599 // ===============================
13600
// Messenger callback: a connection was (re)established.  Nothing to do
// beyond logging; session state is driven by MDS messages.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
}
13605
// Messenger callback: connection was reset by the peer.  Returning
// false leaves reconnection policy to the messenger.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
  return false;
}
13611
/**
 * Messenger callback: the peer dropped and re-established its session
 * state.  For MDS peers, adjust our MetaSession state machine to match.
 */
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
  Mutex::Locker l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p) {
	if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
	  mds = p->first;
	  s = p->second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // We were tearing the session down anyway; treat as closed.
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // Restart the open, carrying the waiters over to a fresh
	    // session object.
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    const md_config_t *conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      // Close and let normal open/reconnect machinery rebuild it.
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      // Keep the session around but mark it stale.
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}
13673
// Messenger callback: connection attempt refused by the peer.  False
// means we do not force a session teardown here.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
  return false;
}
13679
13680 bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
13681 {
13682 if (dest_type == CEPH_ENTITY_TYPE_MON)
13683 return true;
13684 *authorizer = monclient->build_authorizer(dest_type);
13685 return true;
13686 }
13687
/**
 * Walk up from `in` to the nearest ancestor with quota enabled (or the
 * root ancestor if none).  Uses cached dentry leases / dir caps where
 * valid, and falls back to an MDS LOOKUPNAME request otherwise; the walk
 * restarts from `in` after each remote lookup since the lock may have
 * been dropped.
 */
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    // Found a quota root above the starting inode — done.
    if (cur != in && cur->quota.is_enable())
      break;

    // Try to find the parent from local state first.
    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
	Dentry *dn = *p;
	if (dn->lease_mds >= 0 &&
	    dn->lease_ttl > now &&
	    mds_sessions.count(dn->lease_mds)) {
	  // Valid dentry lease from a live session.
	  parent_in = dn->dir->parent_inode;
	} else {
	  // No lease: trust the link only if the parent dir's shared
	  // cap is still current for this dentry.
	  Inode *diri = dn->dir->parent_inode;
	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	      diri->shared_gen == dn->cap_shared_gen) {
	    parent_in = dn->dir->parent_inode;
	  }
	}
	if (parent_in)
	  break;
      }
    } else if (root_parents.count(cur)) {
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    if (cur == root_ancestor)
      break;

    // deleted inode
    if (cur->nlink == 0) {
      cur = root_ancestor;
      break;
    }

    // Ask the MDS who the parent is.
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
		    << " failed to find parent of " << cur->vino()
		    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    // make_request may have dropped the lock; refresh `now` and restart
    // the walk from the beginning unless we were still at the start.
    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
13759
13760 /**
13761 * Traverse quota ancestors of the Inode, return true
13762 * if any of them passes the passed function
13763 */
13764 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13765 std::function<bool (const Inode &in)> test)
13766 {
13767 while (true) {
13768 assert(in != NULL);
13769 if (test(*in)) {
13770 return true;
13771 }
13772
13773 if (in == root_ancestor) {
13774 // We're done traversing, drop out
13775 return false;
13776 } else {
13777 // Continue up the tree
13778 in = get_quota_root(in, perms);
13779 }
13780 }
13781
13782 return false;
13783 }
13784
13785 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13786 {
13787 return check_quota_condition(in, perms,
13788 [](const Inode &in) {
13789 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13790 });
13791 }
13792
/**
 * Would writing `new_bytes` more exceed any ancestor's byte quota?
 *
 * As a side effect, every quota root visited is appended to
 * *quota_roots (when non-null) so callers can later re-check proximity
 * to the limit without re-walking the tree.
 */
bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
				     const UserPerm& perms,
				     std::list<InodeRef>* quota_roots)
{
  return check_quota_condition(in, perms,
      [&new_bytes, quota_roots](const Inode &in) {
	// collect the roots even when the quota is not exceeded
	if (quota_roots)
	  quota_roots->emplace_back(const_cast<Inode*>(&in));
	return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
	       > in.quota.max_bytes;
      });
}
13805
13806 bool Client::is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots)
13807 {
13808 assert(in->size >= in->reported_size);
13809 const uint64_t size = in->size - in->reported_size;
13810
13811 for (auto& diri : quota_roots) {
13812 if (diri->quota.max_bytes) {
13813 if (diri->rstat.rbytes >= diri->quota.max_bytes)
13814 return true;
13815
13816 uint64_t space = diri->quota.max_bytes - diri->rstat.rbytes;
13817 if ((space >> 4) < size)
13818 return true;
13819 }
13820 }
13821 return false;
13822 }
13823
// State/permission bits cached per (pool, namespace) in pool_perms;
// see check_pool_perm().
enum {
  POOL_CHECKED = 1,   // a probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is in flight; waiters block on this
  POOL_READ = 4,      // client caps allow reading the pool
  POOL_WRITE = 8,     // client caps allow writing the pool
};
13830
/**
 * Verify that our OSD caps permit the access `need` (FILE_RD/FILE_WR)
 * on the inode's data pool, probing the pool with a stat and a create
 * the first time and caching the result in pool_perms.
 *
 * Called with client_lock held; drops it while waiting on the probes.
 * Returns 0 if allowed, -EPERM if denied, -EIO if the probe itself
 * failed for an unrelated reason.
 */
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is mid-probe, wait and re-check.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Claim the CHECKING slot so concurrent callers block above.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Probe read permission with a stat on the file's first object.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Probe write permission with an exclusive create of the same object.
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop client_lock while both probes are in flight.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    // -ENOENT still proves read access; only -EPERM means "denied".
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // -EEXIST still proves write access.
    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13933
13934 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13935 {
13936 if (acl_type == POSIX_ACL) {
13937 if (in->xattrs.count(ACL_EA_ACCESS)) {
13938 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13939
13940 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13941 }
13942 }
13943 return -EAGAIN;
13944 }
13945
/**
 * Propagate a chmod into the inode's POSIX access ACL: rewrite the ACL
 * group/owner entries to match `mode` and store it back as an xattr.
 * No-op when ACLs are disabled or the inode has no access ACL.
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Make sure the cached xattrs are current before reading the ACL.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a private copy; posix_acl_access_chmod edits in place.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
13971
/**
 * Compute the ACL xattrs a new inode in `dir` should be created with.
 *
 * Applies the directory's default ACL to *mode (POSIX inheritance) and,
 * when the default ACL is not fully representable as mode bits, encodes
 * an access ACL (plus, for directories, the inherited default ACL) into
 * xattrs_bl.  With no default ACL, the umask callback is applied to
 * *mode instead.
 *
 * @return number of xattrs encoded (0 if none), or negative errno
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      // Fold the default ACL into *mode; r > 0 means the ACL carries
      // information beyond plain mode bits.
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// r > 0 again here means the ACL is not equivalent to the mode
	// and must be stored as an access ACL xattr.
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask via callback, if any.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14019
// Enable a global objecter op flag; only "no flags" or localized reads
// are permitted.
void Client::set_filer_flags(int flags)
{
  Mutex::Locker l(client_lock);
  assert(flags == 0 ||
	 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}
14027
14028 void Client::clear_filer_flags(int flags)
14029 {
14030 Mutex::Locker l(client_lock);
14031 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14032 objecter->clear_global_op_flag(flags);
14033 }
14034
14035 /**
14036 * This is included in cap release messages, to cause
14037 * the MDS to wait until this OSD map epoch. It is necessary
14038 * in corner cases where we cancel RADOS ops, so that
14039 * nobody else tries to do IO to the same objects in
14040 * the same epoch as the cancelled ops.
14041 */
// Record the OSD map epoch to advertise in cap release messages
// (see the comment above for why the MDS must wait for this epoch).
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
14047
14048 const char** Client::get_tracked_conf_keys() const
14049 {
14050 static const char* keys[] = {
14051 "client_cache_size",
14052 "client_cache_mid",
14053 "client_acl_type",
14054 "client_deleg_timeout",
14055 "client_deleg_break_on_open",
14056 NULL
14057 };
14058 return keys;
14059 }
14060
14061 void Client::handle_conf_change(const struct md_config_t *conf,
14062 const std::set <std::string> &changed)
14063 {
14064 Mutex::Locker lock(client_lock);
14065
14066 if (changed.count("client_cache_mid")) {
14067 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14068 }
14069 if (changed.count("client_acl_type")) {
14070 acl_type = NO_ACL;
14071 if (cct->_conf->client_acl_type == "posix_acl")
14072 acl_type = POSIX_ACL;
14073 }
14074 }
14075
// boost::intrusive_ptr hook: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14080
// boost::intrusive_ptr hook: drop a reference via the owning client, so the
// inode can be cleaned up/freed when the last reference goes away.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14085
14086 mds_rank_t Client::_get_random_up_mds() const
14087 {
14088 assert(client_lock.is_locked_by_me());
14089
14090 std::set<mds_rank_t> up;
14091 mdsmap->get_up_mds_set(up);
14092
14093 if (up.empty())
14094 return MDS_RANK_NONE;
14095 std::set<mds_rank_t>::const_iterator p = up.begin();
14096 for (int n = rand() % up.size(); n; n--)
14097 ++p;
14098 return *p;
14099 }
14100
14101
// Standalone (non-embedded) client: owns its own Objecter, constructed here
// and handed to the Client base; destroyed in ~StandaloneClient().
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14108
StandaloneClient::~StandaloneClient()
{
  // We own the objecter (created in our constructor); null it out so the
  // base class destructor cannot touch a dangling pointer.
  delete objecter;
  objecter = nullptr;
}
14114
// Bring up the standalone client: timer, object cacher, objecter, dispatch
// wiring, and the mon session. Returns 0 on success or a negative errno
// from MonClient::init(), after rolling back the partial initialization.
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  // objecter before ourselves so it gets first crack at incoming messages
  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    // (timer shutdown requires client_lock held; the rest must run unlocked)
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  // finish generic Client initialization (takes the lock itself)
  _finish_init();

  return 0;
}
14145
void StandaloneClient::shutdown()
{
  // Tear down in reverse of init(): generic client state first, then the
  // objecter we own, then the mon session.
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}