]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
update sources to v12.2.1
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
21#include <sys/stat.h>
22#include <sys/param.h>
23#include <fcntl.h>
24#include <sys/file.h>
25#include <sys/utsname.h>
26#include <sys/uio.h>
27
28#include <boost/lexical_cast.hpp>
29#include <boost/fusion/include/std_pair.hpp>
30
31#if defined(__FreeBSD__)
32#define XATTR_CREATE 0x1
33#define XATTR_REPLACE 0x2
34#else
35#include <sys/xattr.h>
36#endif
37
38#if defined(__linux__)
39#include <linux/falloc.h>
40#endif
41
42#include <sys/statvfs.h>
43
44#include "common/config.h"
45#include "common/version.h"
46
47// ceph stuff
48#include "messages/MClientSession.h"
49#include "messages/MClientReconnect.h"
50#include "messages/MClientRequest.h"
51#include "messages/MClientRequestForward.h"
52#include "messages/MClientReply.h"
53#include "messages/MClientCaps.h"
54#include "messages/MClientLease.h"
55#include "messages/MClientSnap.h"
56#include "messages/MCommandReply.h"
57#include "messages/MOSDMap.h"
58#include "messages/MClientQuota.h"
59#include "messages/MClientCapRelease.h"
60#include "messages/MMDSMap.h"
61#include "messages/MFSMap.h"
62#include "messages/MFSMapUser.h"
63
64#include "mon/MonClient.h"
65
66#include "mds/flock.h"
67#include "osd/OSDMap.h"
68#include "osdc/Filer.h"
69
70#include "common/Cond.h"
71#include "common/Mutex.h"
72#include "common/perf_counters.h"
73#include "common/admin_socket.h"
74#include "common/errno.h"
75#include "include/str_list.h"
76
77#define dout_subsys ceph_subsys_client
78
79#include "include/lru.h"
80#include "include/compat.h"
81#include "include/stringify.h"
82
83#include "Client.h"
84#include "Inode.h"
85#include "Dentry.h"
86#include "Dir.h"
87#include "ClientSnapRealm.h"
88#include "Fh.h"
89#include "MetaSession.h"
90#include "MetaRequest.h"
91#include "ObjecterWriteback.h"
92#include "posix_acl.h"
93
94#include "include/assert.h"
95#include "include/stat.h"
96
97#include "include/cephfs/ceph_statx.h"
98
99#if HAVE_GETGROUPLIST
100#include <grp.h>
101#include <pwd.h>
102#include <unistd.h>
103#endif
104
105#undef dout_prefix
106#define dout_prefix *_dout << "client." << whoami << " "
107
108#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
109
110// FreeBSD fails to define this
111#ifndef O_DSYNC
112#define O_DSYNC 0x0
113#endif
114// Darwin fails to define this
115#ifndef O_RSYNC
116#define O_RSYNC 0x0
117#endif
118
119#ifndef O_DIRECT
120#define O_DIRECT 0x0
121#endif
122
123#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
124
125void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
126{
127 Client *client = static_cast<Client*>(p);
128 client->flush_set_callback(oset);
129}
130
131
132// -------------
133
// CommandHook binds admin-socket ("asok") commands to a live Client
// instance; dispatch happens in CommandHook::call().
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}
138
139bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
140 std::string format, bufferlist& out)
141{
142 Formatter *f = Formatter::create(format);
143 f->open_object_section("result");
144 m_client->client_lock.Lock();
145 if (command == "mds_requests")
146 m_client->dump_mds_requests(f);
147 else if (command == "mds_sessions")
148 m_client->dump_mds_sessions(f);
149 else if (command == "dump_cache")
150 m_client->dump_cache(f);
151 else if (command == "kick_stale_sessions")
152 m_client->_kick_stale_sessions();
153 else if (command == "status")
154 m_client->dump_status(f);
155 else
156 assert(0 == "bad command registered");
157 m_client->client_lock.Unlock();
158 f->close_section();
159 f->flush(out);
160 delete f;
161 return true;
162}
163
164
165// -------------
166
// Cursor state for an open directory.  offset/next_offset track the
// readdir position (offsets below 2 are reserved; see the
// readdir_offset == 2 baseline in insert_readdir_results), and the
// release/ordered counts plus start_shared_gen snapshot the directory
// inode's state so cached readdir results can later be validated.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
172
173void Client::_reset_faked_inos()
174{
175 ino_t start = 1024;
176 free_faked_inos.clear();
177 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
178 last_used_faked_ino = 0;
179 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
180}
181
// Hand out the next free fake inode number to 'in', scanning the free
// interval set upward from the last allocation so recently released
// numbers are not immediately reused.
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  // Nothing free above the cursor: wrap around and rescan from the bottom.
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // The free pool spans ~2^32 numbers; exhausting it is a logic error.
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // Cursor fell in a gap: jump to the start of the next free interval.
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // Still inside the current free interval: take the next number.
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // Mark the number allocated and remember the mapping back to the real vino.
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
201
202void Client::_release_faked_ino(Inode *in)
203{
204 free_faked_inos.insert(in->faked_ino);
205 faked_ino_map.erase(in->faked_ino);
206}
207
208vinodeno_t Client::_map_faked_ino(ino_t ino)
209{
210 vinodeno_t vino;
211 if (ino == 1)
212 vino = root->vino();
213 else if (faked_ino_map.count(ino))
214 vino = faked_ino_map[ino];
215 else
216 vino = vinodeno_t(0, CEPH_NOSNAP);
217 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
218 return vino;
219}
220
// Public, locked wrapper around _map_faked_ino().
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  Mutex::Locker lock(client_lock);
  return _map_faked_ino(ino);
}
226
227// cons/des
228
// Construct an unmounted client bound to a messenger / monitor client /
// objecter trio.  Only in-memory state and helper objects are set up
// here; thread startup and dispatcher registration happen later in
// init()/_finish_init().
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    getgroups_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    require_remount(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-1), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock")
{
  _reset_faked_inos();
  //
  root = 0;

  num_flushing_caps = 0;

  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  // Allocatable fds start at 10 -- presumably so low fd numbers are
  // never handed out to libcephfs callers; TODO confirm rationale.
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback,    // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  // Ask the objecter to report blacklist events for this client.
  objecter->enable_blacklist_events();
}
297
298
299Client::~Client()
300{
301 assert(!client_lock.is_locked());
302
31f18b77
FG
303 // It is necessary to hold client_lock, because any inode destruction
304 // may call into ObjectCacher, which asserts that it's lock (which is
305 // client_lock) is held.
306 client_lock.Lock();
7c673cae 307 tear_down_cache();
31f18b77 308 client_lock.Unlock();
7c673cae
FG
309}
310
311void Client::tear_down_cache()
312{
313 // fd's
314 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
315 it != fd_map.end();
316 ++it) {
317 Fh *fh = it->second;
318 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
319 _release_fh(fh);
320 }
321 fd_map.clear();
322
323 while (!opened_dirs.empty()) {
324 dir_result_t *dirp = *opened_dirs.begin();
325 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
326 _closedir(dirp);
327 }
328
329 // caps!
330 // *** FIXME ***
331
332 // empty lru
7c673cae
FG
333 trim_cache();
334 assert(lru.lru_get_size() == 0);
335
336 // close root ino
337 assert(inode_map.size() <= 1 + root_parents.size());
338 if (root && inode_map.size() == 1 + root_parents.size()) {
339 delete root;
340 root = 0;
341 root_ancestor = 0;
342 while (!root_parents.empty())
343 root_parents.erase(root_parents.begin());
344 inode_map.clear();
345 _reset_faked_inos();
346 }
347
348 assert(inode_map.empty());
349}
350
351inodeno_t Client::get_root_ino()
352{
353 Mutex::Locker l(client_lock);
354 if (use_faked_inos())
355 return root->faked_ino;
356 else
357 return root->ino;
358}
359
// Take an ll_ reference on the root inode and return it.  The caller
// owns the reference (presumably released via the ll_put/ll_forget
// path -- confirm against libcephfs callers).
Inode *Client::get_root()
{
  Mutex::Locker l(client_lock);
  root->ll_get();
  return root;
}
366
367
368// debug crapola
369
370void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
371{
372 filepath path;
373 in->make_long_path(path);
374 ldout(cct, 1) << "dump_inode: "
375 << (disconnected ? "DISCONNECTED ":"")
376 << "inode " << in->ino
377 << " " << path
378 << " ref " << in->get_num_ref()
379 << *in << dendl;
380
381 if (f) {
382 f->open_object_section("inode");
383 f->dump_stream("path") << path;
384 if (disconnected)
385 f->dump_int("disconnected", 1);
386 in->dump(f);
387 f->close_section();
388 }
389
390 did.insert(in);
391 if (in->dir) {
392 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
393 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
394 it != in->dir->dentries.end();
395 ++it) {
396 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
397 if (f) {
398 f->open_object_section("dentry");
399 it->second->dump(f);
400 f->close_section();
401 }
402 if (it->second->inode)
403 dump_inode(f, it->second->inode.get(), did, false);
404 }
405 }
406}
407
408void Client::dump_cache(Formatter *f)
409{
410 set<Inode*> did;
411
412 ldout(cct, 1) << "dump_cache" << dendl;
413
414 if (f)
415 f->open_array_section("cache");
416
417 if (root)
418 dump_inode(f, root, did, true);
419
420 // make a second pass to catch anything disconnected
421 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
422 it != inode_map.end();
423 ++it) {
424 if (did.count(it->second))
425 continue;
426 dump_inode(f, it->second, did, true);
427 }
428
429 if (f)
430 f->close_section();
431}
432
// Dump a one-object summary of client state (session metadata, cache
// counters, map epochs) into 'f'.  Caller must hold client_lock.
void Client::dump_status(Formatter *f)
{
  assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
  }
}
457
// First stage of startup: start the timer and the object cacher threads,
// register with the messenger so we start receiving messages, then hand
// off to _finish_init() for perf counters and admin-socket commands.
// Always returns 0.
int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}
472
473void Client::_finish_init()
474{
475 client_lock.Lock();
476 // logger
477 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
478 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
479 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
480 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
481 logger.reset(plb.create_perf_counters());
482 cct->get_perfcounters_collection()->add(logger.get());
483
484 client_lock.Unlock();
485
486 cct->_conf->add_observer(this);
487
488 AdminSocket* admin_socket = cct->get_admin_socket();
489 int ret = admin_socket->register_command("mds_requests",
490 "mds_requests",
491 &m_command_hook,
492 "show in-progress mds requests");
493 if (ret < 0) {
494 lderr(cct) << "error registering admin socket command: "
495 << cpp_strerror(-ret) << dendl;
496 }
497 ret = admin_socket->register_command("mds_sessions",
498 "mds_sessions",
499 &m_command_hook,
500 "show mds session state");
501 if (ret < 0) {
502 lderr(cct) << "error registering admin socket command: "
503 << cpp_strerror(-ret) << dendl;
504 }
505 ret = admin_socket->register_command("dump_cache",
506 "dump_cache",
507 &m_command_hook,
508 "show in-memory metadata cache contents");
509 if (ret < 0) {
510 lderr(cct) << "error registering admin socket command: "
511 << cpp_strerror(-ret) << dendl;
512 }
513 ret = admin_socket->register_command("kick_stale_sessions",
514 "kick_stale_sessions",
515 &m_command_hook,
516 "kick sessions that were remote reset");
517 if (ret < 0) {
518 lderr(cct) << "error registering admin socket command: "
519 << cpp_strerror(-ret) << dendl;
520 }
521 ret = admin_socket->register_command("status",
522 "status",
523 &m_command_hook,
524 "show overall client status");
525 if (ret < 0) {
526 lderr(cct) << "error registering admin socket command: "
527 << cpp_strerror(-ret) << dendl;
528 }
529
530 client_lock.Lock();
531 initialized = true;
532 client_lock.Unlock();
533}
534
// Tear the client down, roughly mirroring init()/_finish_init() in
// reverse: close sessions, unregister observers and asok commands, drain
// and stop the callback finishers, stop the object cacher, then the
// timer, the objecter finisher, and finally the perf counters.
void Client::shutdown()
{
  ldout(cct, 1) << "shutdown" << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf->remove_observer(this);

  // Unregister the admin-socket commands added in _finish_init().
  AdminSocket* admin_socket = cct->get_admin_socket();
  admin_socket->unregister_command("mds_requests");
  admin_socket->unregister_command("mds_sessions");
  admin_socket->unregister_command("dump_cache");
  admin_socket->unregister_command("kick_stale_sessions");
  admin_socket->unregister_command("status");

  // Only stop a finisher if its callback was set (the finishers are
  // presumably started when the callback is registered -- confirm
  // against the ll_register_* paths).
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
594
595
596// ===================
597// metadata cache stuff
598
// Shrink the dentry LRU down to client_cache_size (all the way down when
// unmounting).  Optionally asks the kernel to drop its dcache references
// if we are still over budget afterwards, and releases the root inode
// once nothing else remains cached.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // Iterate until the LRU size stops changing: each trim_dentry() can
  // change what is expirable, so a single pass is not sufficient.
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  // Still over budget: whatever is left could not be expired, so ask the
  // kernel to drop its dcache entries (which pin ours).
  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
632
633void Client::trim_cache_for_reconnect(MetaSession *s)
634{
635 mds_rank_t mds = s->mds_num;
636 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
637
638 int trimmed = 0;
639 list<Dentry*> skipped;
640 while (lru.lru_get_size() > 0) {
641 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
642 if (!dn)
643 break;
644
645 if ((dn->inode && dn->inode->caps.count(mds)) ||
646 dn->dir->parent_inode->caps.count(mds)) {
647 trim_dentry(dn);
648 trimmed++;
649 } else
650 skipped.push_back(dn);
651 }
652
653 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
654 lru.lru_insert_mid(*p);
655
656 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
657 << " trimmed " << trimmed << " dentries" << dendl;
658
659 if (s->caps.size() > 0)
660 _invalidate_kernel_dcache();
661}
662
// Unlink one dentry from the cache.  If it pointed at an inode, the
// parent directory loses its completeness guarantee (a cached-complete
// listing would now be missing this entry), so bump dir_release_count
// and clear I_COMPLETE/I_DIR_ORDERED.
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir " << hex << dn->dir->parent_inode->ino
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
675
676
// Apply size/time/truncation fields from an MDS message to 'in', but
// only where the caps we hold ('issued') say the MDS values are not
// stale relative to local state: truncate_seq orders size/truncation
// updates and time_warp_seq orders timestamp updates.
void Client::update_inode_file_bits(Inode *in,
				    uint64_t truncate_seq, uint64_t truncate_size,
				    uint64_t size, uint64_t change_attr,
				    uint64_t time_warp_seq, utime_t ctime,
				    utime_t mtime,
				    utime_t atime,
				    version_t inline_version,
				    bufferlist& inline_data,
				    int issued)
{
  bool warn = false;
  ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
		 << " mtime " << mtime << dendl;
  ldout(cct, 25) << "truncate_seq: mds " << truncate_seq << " local "
		 << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
		 << " local " << in->time_warp_seq << dendl;
  uint64_t prior_size = in->size;

  if (inline_version > in->inline_version) {
    in->inline_data = inline_data;
    in->inline_version = inline_version;
  }

  /* always take a newer change attr */
  if (change_attr > in->change_attr)
    in->change_attr = change_attr;

  // Take the MDS size if its truncation epoch is newer, or if it grew
  // within the same epoch.
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }

  // be careful with size, mtime, atime
  // Holding any of these caps means we may have locally newer times than
  // the MDS, so only merge (never blindly overwrite).
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
	       << " is higher than local time_warp_seq "
	       << in->time_warp_seq << dendl;
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // Our time_warp_seq is ahead of the MDS's without an EXCL cap to
    // justify it -- unexpected; log loudly.
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
782
783void Client::_fragmap_remove_non_leaves(Inode *in)
784{
785 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
786 if (!in->dirfragtree.is_leaf(p->first))
787 in->fragmap.erase(p++);
788 else
789 ++p;
790}
791
792void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
793{
794 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
795 if (p->second == mds)
796 in->fragmap.erase(p++);
797 else
798 ++p;
799}
800
// Create or refresh the in-memory Inode for an InodeStat from an MDS
// reply, and merge in the accompanying cap grant.  Field updates are
// gated on version numbers and on which caps we already hold, so a stale
// or racing reply never clobbers newer locally-known state.  Returns the
// (possibly newly created) Inode.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // The very first inode we learn becomes the root.
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // Not mounted yet: record the ancestor chain (presumably walking
      // up toward the real root during mount -- confirm in mount path).
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  if (was_new)
    ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning indoes in different snaprealms (no caps!)

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool updating_inode = false;
  int issued = 0;
  if (st->version == 0 ||
      (in->version & ~1) < st->version) {
    updating_inode = true;

    int implemented = 0;
    issued = in->caps_issued(&implemented) | in->caps_dirty();
    issued |= implemented;

    in->version = st->version;

    // Fields covered by an EXCL cap we hold are locally authoritative;
    // skip them so we don't undo changes the MDS has not seen yet.
    if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
      in->mode = st->mode;
      in->uid = st->uid;
      in->gid = st->gid;
      in->btime = st->btime;
    }

    if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
      in->nlink = st->nlink;
    }

    in->dirstat = st->dirstat;
    in->rstat = st->rstat;
    in->quota = st->quota;
    in->layout = st->layout;

    if (in->is_dir()) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
    }

    update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
			   st->change_attr, st->time_warp_seq, st->ctime,
			   st->mtime, st->atime, st->inline_version,
			   st->inline_data, issued);
  } else if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  // xattrs: take the MDS copy only if we don't hold XATTR_EXCL (or have
  // none yet) and the reply's xattr version is newer.
  if ((in->xattr_version  == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  // move me if/when version reflects fragtree changes.
  if (in->dirfragtree != st->dirfragtree) {
    in->dirfragtree = st->dirfragtree;
    _fragmap_remove_non_leaves(in);
  }

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
		   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
		   request_perms);
    if (in->auth_cap && in->auth_cap->session == session)
      in->max_size = st->max_size;
  } else
    // Snapshot inodes carry no live cap; just accumulate the bits.
    in->snap_caps |= st->cap.caps;

  // setting I_COMPLETE needs to happen after adding the cap
  if (updating_inode &&
      in->is_dir() &&
      (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
      (issued & CEPH_CAP_FILE_EXCL) == 0 &&
      in->dirstat.nfiles == 0 &&
      in->dirstat.nsubdirs == 0) {
    ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
    in->flags |= I_COMPLETE | I_DIR_ORDERED;
    if (in->dir) {
      ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		     << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
      in->dir->readdir_cache.clear();
      for (auto p = in->dir->dentries.begin();
	   p != in->dir->dentries.end();
	   ++p) {
	unlink(p->second, true, true);  // keep dir, keep dentry
      }
      if (in->dir->dentries.empty())
	close_dir(in->dir);
    }
  }

  return in;
}
934
935
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the
 * metadata cache.  Reuses an existing dentry when its vino already
 * matches; otherwise (re)links it, clearing ordering/completeness on any
 * directory whose contents change.  Always refreshes the dentry lease.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // Pin 'in' across the relink so it cannot be torn down mid-way.
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// Entry moved out of a different directory: that directory's
	// ordering guarantee is broken (it may still be complete).
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
984
985void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
986{
987 utime_t dttl = from;
988 dttl += (float)dlease->duration_ms / 1000.0;
989
990 assert(dn);
991
992 if (dlease->mask & CEPH_LOCK_DN) {
993 if (dttl > dn->lease_ttl) {
994 ldout(cct, 10) << "got dentry lease on " << dn->name
995 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
996 dn->lease_ttl = dttl;
997 dn->lease_mds = session->mds_num;
998 dn->lease_seq = dlease->seq;
999 dn->lease_gen = session->cap_gen;
1000 }
1001 }
1002 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1003}
1004
1005
/*
 * update MDS location cache for a single inode: record (or forget) which
 * MDS rank is authoritative for one dirfrag, keep the cached fragtree
 * consistent, and note whether the frag is replicated.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // Negative auth means no known authority for this frag.
    in->fragmap.erase(dst->frag);
  }
  // If the reported frag isn't a leaf of our cached fragtree, force it to
  // be one and drop fragmap entries that stop being leaves as a result.
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1040
1041void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1042{
1043 if (diri->flags & I_COMPLETE) {
1044 if (complete) {
1045 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1046 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1047 } else {
1048 if (diri->flags & I_DIR_ORDERED) {
1049 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1050 diri->flags &= ~I_DIR_ORDERED;
1051 }
1052 }
1053 if (diri->dir)
1054 diri->dir->readdir_cache.clear();
1055 }
1056}
1057
1058/*
1059 * insert results from readdir or lssnap into the metadata cache.
1060 */
1061void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
1062
1063 MClientReply *reply = request->reply;
1064 ConnectionRef con = request->reply->get_connection();
1065 uint64_t features = con->get_features();
1066
1067 dir_result_t *dirp = request->dirp;
1068 assert(dirp);
1069
1070 // the extra buffer list is only set for readdir and lssnap replies
1071 bufferlist::iterator p = reply->get_extra_bl().begin();
1072 if (!p.end()) {
1073 // snapdir?
1074 if (request->head.op == CEPH_MDS_OP_LSSNAP) {
1075 assert(diri);
1076 diri = open_snapdir(diri);
1077 }
1078
1079 // only open dir if we're actually adding stuff to it!
1080 Dir *dir = diri->open_dir();
1081 assert(dir);
1082
1083 // dirstat
1084 DirStat dst(p);
1085 __u32 numdn;
1086 __u16 flags;
1087 ::decode(numdn, p);
1088 ::decode(flags, p);
1089
1090 bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
1091 bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
1092
1093 frag_t fg = (unsigned)request->head.args.readdir.frag;
1094 unsigned readdir_offset = dirp->next_offset;
1095 string readdir_start = dirp->last_name;
1096 assert(!readdir_start.empty() || readdir_offset == 2);
1097
1098 unsigned last_hash = 0;
1099 if (hash_order) {
1100 if (!readdir_start.empty()) {
1101 last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
1102 } else if (flags & CEPH_READDIR_OFFSET_HASH) {
1103 /* mds understands offset_hash */
1104 last_hash = (unsigned)request->head.args.readdir.offset_hash;
1105 }
1106 }
1107
1108 if (fg != dst.frag) {
1109 ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
1110 fg = dst.frag;
1111 if (!hash_order) {
1112 readdir_offset = 2;
1113 readdir_start.clear();
1114 dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
1115 }
1116 }
1117
1118 ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
1119 << ", hash_order=" << hash_order
1120 << ", readdir_start " << readdir_start
1121 << ", last_hash " << last_hash
1122 << ", next_offset " << readdir_offset << dendl;
1123
1124 if (diri->snapid != CEPH_SNAPDIR &&
1125 fg.is_leftmost() && readdir_offset == 2 &&
1126 !(hash_order && last_hash)) {
1127 dirp->release_count = diri->dir_release_count;
1128 dirp->ordered_count = diri->dir_ordered_count;
1129 dirp->start_shared_gen = diri->shared_gen;
1130 dirp->cache_index = 0;
1131 }
1132
1133 dirp->buffer_frag = fg;
1134
1135 _readdir_drop_dirp_buffer(dirp);
1136 dirp->buffer.reserve(numdn);
1137
1138 string dname;
1139 LeaseStat dlease;
1140 for (unsigned i=0; i<numdn; i++) {
1141 ::decode(dname, p);
1142 ::decode(dlease, p);
1143 InodeStat ist(p, features);
1144
1145 ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
1146
1147 Inode *in = add_update_inode(&ist, request->sent_stamp, session,
1148 request->perms);
1149 Dentry *dn;
1150 if (diri->dir->dentries.count(dname)) {
1151 Dentry *olddn = diri->dir->dentries[dname];
1152 if (olddn->inode != in) {
1153 // replace incorrect dentry
1154 unlink(olddn, true, true); // keep dir, dentry
1155 dn = link(dir, dname, in, olddn);
1156 assert(dn == olddn);
1157 } else {
1158 // keep existing dn
1159 dn = olddn;
1160 touch_dn(dn);
1161 }
1162 } else {
1163 // new dn
1164 dn = link(dir, dname, in, NULL);
1165 }
1166
1167 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1168 if (hash_order) {
1169 unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
1170 if (hash != last_hash)
1171 readdir_offset = 2;
1172 last_hash = hash;
1173 dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
1174 } else {
1175 dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
1176 }
1177 // add to readdir cache
1178 if (dirp->release_count == diri->dir_release_count &&
1179 dirp->ordered_count == diri->dir_ordered_count &&
1180 dirp->start_shared_gen == diri->shared_gen) {
1181 if (dirp->cache_index == dir->readdir_cache.size()) {
1182 if (i == 0) {
1183 assert(!dirp->inode->is_complete_and_ordered());
1184 dir->readdir_cache.reserve(dirp->cache_index + numdn);
1185 }
1186 dir->readdir_cache.push_back(dn);
1187 } else if (dirp->cache_index < dir->readdir_cache.size()) {
1188 if (dirp->inode->is_complete_and_ordered())
1189 assert(dir->readdir_cache[dirp->cache_index] == dn);
1190 else
1191 dir->readdir_cache[dirp->cache_index] = dn;
1192 } else {
1193 assert(0 == "unexpected readdir buffer idx");
1194 }
1195 dirp->cache_index++;
1196 }
1197 // add to cached result list
1198 dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
1199 ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
1200 }
1201
1202 if (numdn > 0)
1203 dirp->last_name = dname;
1204 if (end)
1205 dirp->next_offset = 2;
1206 else
1207 dirp->next_offset = readdir_offset;
1208
1209 if (dir->is_empty())
1210 close_dir(dir);
1211 }
1212}
1213
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * A reply may carry an optional snap trace, an optional dentry record
 * (parent dir inode + dirstat + dentry name/lease) and an optional target
 * inode record.  Each piece present is decoded and merged into the local
 * metadata cache.  Returns the target inode (also stored in
 * request->target), or NULL for traceless/unsafe replies.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // we already applied the trace from the earlier unsafe reply; the safe
    // reply carries no trace
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we cannot update the cache from it, so invalidate
    // what we know about the affected directory instead
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      // for successful mutations, drop the dentries the MDS would have
      // unlinked so the cache does not keep stale links
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug mode: verify the MDS actually returned xattrs when we asked
      // for them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // trace names a dentry but no target inode: the name does not exist.
      // drop any cached link for it, and optionally cache a null dentry if
      // the MDS gave us a lease on the negative result.
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1383
1384// -------
1385
/*
 * Pick the MDS rank to send a request to.
 *
 * Preference order: an explicit resend_mds override on the request; a
 * frag->mds mapping derived from the dentry-name hash (for path-based
 * requests); the session holding caps on the relevant inode (auth cap if
 * the op requires the auth MDS); and finally a random up MDS.
 *
 * If the choice came from a dirfragtree hash, *phash_diri is set to the
 * directory inode whose fragmap was consulted (so the caller can prune a
 * stale mapping if the target turns out to be stopped).
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  // explicit override (e.g. a forwarded or replayed request)
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  // derive a dentry-name hash from either the request inode + first path
  // component, or from the request dentry
  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes have no caps of their own; walk up to the nearest
    // non-snap ancestor and route based on that
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dn_set.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    // path-hash routing: map hash -> frag -> mds via the cached fragmap
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
        ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
        goto out;
      }
    }

    // otherwise route to a session we hold caps from (auth cap when the op
    // requires the auth MDS, any cap otherwise)
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1485
1486
1487void Client::connect_mds_targets(mds_rank_t mds)
1488{
1489 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1490 assert(mds_sessions.count(mds));
1491 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1492 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1493 q != info.export_targets.end();
1494 ++q) {
1495 if (mds_sessions.count(*q) == 0 &&
1496 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1497 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1498 << " export target mds." << *q << dendl;
1499 _open_mds_session(*q);
1500 }
1501 }
1502}
1503
1504void Client::dump_mds_sessions(Formatter *f)
1505{
1506 f->dump_int("id", get_nodeid().v);
1507 f->open_array_section("sessions");
1508 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1509 f->open_object_section("session");
1510 p->second->dump(f);
1511 f->close_section();
1512 }
1513 f->close_section();
1514 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1515}
1516void Client::dump_mds_requests(Formatter *f)
1517{
1518 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1519 p != mds_requests.end();
1520 ++p) {
1521 f->open_object_section("request");
1522 p->second->dump(f);
1523 f->close_section();
1524 }
1525}
1526
/*
 * After a successful reply, make sure the caller gets a target inode.
 *
 * If the reply carried a trace, request->target already points at the
 * inode.  Otherwise (traceless reply, e.g. after MDS restart/replay) we
 * re-fetch the target via lookup/getattr.  Also decodes the created-inode
 * number from the reply's extra bufferlist and reports it via *pcreated.
 *
 * Returns r (possibly replaced by a lookup/getattr error, or -EINTR if the
 * created ino and the looked-up ino disagree).
 */
int Client::verify_reply_trace(int r,
                               MetaRequest *request, MClientReply *reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // insert_trace() already resolved the target inode
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      // no trace, but the created inode is already cached
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          assert(0 == "how did this happen? i want logs!");
        }
      } else {
        // no dentry on the request either: refresh the request inode itself
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1599
1600
1601/**
1602 * make a request
1603 *
1604 * Blocking helper to make an MDS request.
1605 *
1606 * If the ptarget flag is set, behavior changes slightly: the caller
1607 * expects to get a pointer to the inode we are creating or operating
1608 * on. As a result, we will follow up any traceless mutation reply
1609 * with a getattr or lookup to transparently handle a traceless reply
1610 * from the MDS (as when the MDS restarts and the client has to replay
1611 * a request).
1612 *
1613 * @param request the MetaRequest to execute
1614 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1615 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1616 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1617 * @param use_mds [optional] prefer a specific mds (-1 for default)
1618 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1619 */
1620int Client::make_request(MetaRequest *request,
1621 const UserPerm& perms,
1622 InodeRef *ptarget, bool *pcreated,
1623 mds_rank_t use_mds,
1624 bufferlist *pdirbl)
1625{
1626 int r = 0;
1627
1628 // assign a unique tid
1629 ceph_tid_t tid = ++last_tid;
1630 request->set_tid(tid);
1631
1632 // and timestamp
1633 request->op_stamp = ceph_clock_now();
1634
1635 // make note
1636 mds_requests[tid] = request->get();
1637 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1638 oldest_tid = tid;
1639
1640 request->set_caller_perms(perms);
1641
1642 if (cct->_conf->client_inject_fixed_oldest_tid) {
1643 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1644 request->set_oldest_client_tid(1);
1645 } else {
1646 request->set_oldest_client_tid(oldest_tid);
1647 }
1648
1649 // hack target mds?
1650 if (use_mds >= 0)
1651 request->resend_mds = use_mds;
1652
1653 while (1) {
1654 if (request->aborted())
1655 break;
1656
31f18b77
FG
1657 if (blacklisted) {
1658 request->abort(-EBLACKLISTED);
1659 break;
1660 }
1661
7c673cae
FG
1662 // set up wait cond
1663 Cond caller_cond;
1664 request->caller_cond = &caller_cond;
1665
1666 // choose mds
1667 Inode *hash_diri = NULL;
1668 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1669 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1670 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1671 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1672 if (hash_diri) {
1673 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1674 _fragmap_remove_stopped_mds(hash_diri, mds);
1675 } else {
1676 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1677 request->resend_mds = _get_random_up_mds();
1678 }
1679 } else {
1680 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1681 wait_on_list(waiting_for_mdsmap);
1682 }
1683 continue;
1684 }
1685
1686 // open a session?
1687 MetaSession *session = NULL;
1688 if (!have_open_session(mds)) {
1689 session = _get_or_open_mds_session(mds);
1690
1691 // wait
1692 if (session->state == MetaSession::STATE_OPENING) {
1693 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1694 wait_on_context_list(session->waiting_for_open);
1695 // Abort requests on REJECT from MDS
1696 if (rejected_by_mds.count(mds)) {
1697 request->abort(-EPERM);
1698 break;
1699 }
1700 continue;
1701 }
1702
1703 if (!have_open_session(mds))
1704 continue;
1705 } else {
1706 session = mds_sessions[mds];
1707 }
1708
1709 // send request.
1710 send_request(request, session);
1711
1712 // wait for signal
1713 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1714 request->kick = false;
1715 while (!request->reply && // reply
1716 request->resend_mds < 0 && // forward
1717 !request->kick)
1718 caller_cond.Wait(client_lock);
1719 request->caller_cond = NULL;
1720
1721 // did we get a reply?
1722 if (request->reply)
1723 break;
1724 }
1725
1726 if (!request->reply) {
1727 assert(request->aborted());
1728 assert(!request->got_unsafe);
1729 r = request->get_abort_code();
1730 request->item.remove_myself();
1731 unregister_request(request);
1732 put_request(request); // ours
1733 return r;
1734 }
1735
1736 // got it!
1737 MClientReply *reply = request->reply;
1738 request->reply = NULL;
1739 r = reply->get_result();
1740 if (r >= 0)
1741 request->success = true;
1742
1743 // kick dispatcher (we've got it!)
1744 assert(request->dispatch_cond);
1745 request->dispatch_cond->Signal();
1746 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1747 request->dispatch_cond = 0;
1748
1749 if (r >= 0 && ptarget)
1750 r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);
1751
1752 if (pdirbl)
1753 pdirbl->claim(reply->get_extra_bl());
1754
1755 // -- log times --
1756 utime_t lat = ceph_clock_now();
1757 lat -= request->sent_stamp;
1758 ldout(cct, 20) << "lat " << lat << dendl;
1759 logger->tinc(l_c_lat, lat);
1760 logger->tinc(l_c_reply, lat);
1761
1762 put_request(request);
1763
1764 reply->put();
1765 return r;
1766}
1767
1768void Client::unregister_request(MetaRequest *req)
1769{
1770 mds_requests.erase(req->tid);
1771 if (req->tid == oldest_tid) {
1772 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1773 while (true) {
1774 if (p == mds_requests.end()) {
1775 oldest_tid = 0;
1776 break;
1777 }
1778 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1779 oldest_tid = p->first;
1780 break;
1781 }
1782 ++p;
1783 }
1784 }
1785 put_request(req);
1786}
1787
1788void Client::put_request(MetaRequest *request)
1789{
1790 if (request->_put()) {
1791 int op = -1;
1792 if (request->success)
1793 op = request->get_op();
1794 InodeRef other_in;
1795 request->take_other_inode(&other_in);
1796 delete request;
1797
1798 if (other_in &&
1799 (op == CEPH_MDS_OP_RMDIR ||
1800 op == CEPH_MDS_OP_RENAME ||
1801 op == CEPH_MDS_OP_RMSNAP)) {
1802 _try_to_trim_inode(other_in.get(), false);
1803 }
1804 }
1805}
1806
1807int Client::encode_inode_release(Inode *in, MetaRequest *req,
1808 mds_rank_t mds, int drop,
1809 int unless, int force)
1810{
1811 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1812 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1813 << ", have:" << ", force:" << force << ")" << dendl;
1814 int released = 0;
1815 if (in->caps.count(mds)) {
1816 Cap *caps = in->caps[mds];
1817 drop &= ~(in->dirty_caps | get_caps_used(in));
1818 if ((drop & caps->issued) &&
1819 !(unless & caps->issued)) {
1820 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1821 caps->issued &= ~drop;
1822 caps->implemented &= ~drop;
1823 released = 1;
1824 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1825 } else {
1826 released = force;
1827 }
1828 if (released) {
1829 ceph_mds_request_release rel;
1830 rel.ino = in->ino;
1831 rel.cap_id = caps->cap_id;
1832 rel.seq = caps->seq;
1833 rel.issue_seq = caps->issue_seq;
1834 rel.mseq = caps->mseq;
1835 rel.caps = caps->implemented;
1836 rel.wanted = caps->wanted;
1837 rel.dname_len = 0;
1838 rel.dname_seq = 0;
1839 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1840 }
1841 }
1842 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1843 << released << dendl;
1844 return released;
1845}
1846
/*
 * Append a release for the dentry's parent-inode caps and, if we hold a
 * lease on the dentry from this MDS, preemptively release that lease too.
 */
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
                                   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << "encode_dentry_release enter(dn:"
                 << dn << ")" << dendl;
  int released = 0;
  // force=1 so a release entry for the parent inode is appended even when
  // no caps are actually dropped; the lease release below needs an entry
  // to attach to
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
                                    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    // attach the dentry lease info to the entry encode_inode_release()
    // just pushed onto req->cap_releases
    MClientRequest::Release& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << "encode_dentry_release exit(dn:"
                 << dn << ")" << dendl;
}
1866
1867
1868/*
1869 * This requires the MClientRequest *request member to be set.
1870 * It will error out horribly without one.
1871 * Additionally, if you set any *drop member, you'd better have
1872 * set the corresponding dentry!
1873 */
1874void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1875{
1876 ldout(cct, 20) << "encode_cap_releases enter (req: "
1877 << req << ", mds: " << mds << ")" << dendl;
1878 if (req->inode_drop && req->inode())
1879 encode_inode_release(req->inode(), req,
1880 mds, req->inode_drop,
1881 req->inode_unless);
1882
1883 if (req->old_inode_drop && req->old_inode())
1884 encode_inode_release(req->old_inode(), req,
1885 mds, req->old_inode_drop,
1886 req->old_inode_unless);
1887 if (req->other_inode_drop && req->other_inode())
1888 encode_inode_release(req->other_inode(), req,
1889 mds, req->other_inode_drop,
1890 req->other_inode_unless);
1891
1892 if (req->dentry_drop && req->dentry())
1893 encode_dentry_release(req->dentry(), req,
1894 mds, req->dentry_drop,
1895 req->dentry_unless);
1896
1897 if (req->old_dentry_drop && req->old_dentry())
1898 encode_dentry_release(req->old_dentry(), req,
1899 mds, req->old_dentry_drop,
1900 req->old_dentry_unless);
1901 ldout(cct, 25) << "encode_cap_releases exit (req: "
1902 << req << ", mds " << mds <<dendl;
1903}
1904
1905bool Client::have_open_session(mds_rank_t mds)
1906{
1907 return
1908 mds_sessions.count(mds) &&
1909 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1910 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1911}
1912
1913MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1914{
1915 if (mds_sessions.count(mds) == 0)
1916 return NULL;
1917 MetaSession *s = mds_sessions[mds];
1918 if (s->con != con)
1919 return NULL;
1920 return s;
1921}
1922
1923MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1924{
1925 if (mds_sessions.count(mds))
1926 return mds_sessions[mds];
1927 return _open_mds_session(mds);
1928}
1929
1930/**
1931 * Populate a map of strings with client-identifying metadata,
1932 * such as the hostname. Call this once at initialization.
1933 */
1934void Client::populate_metadata(const std::string &mount_root)
1935{
1936 // Hostname
1937 struct utsname u;
1938 int r = uname(&u);
1939 if (r >= 0) {
1940 metadata["hostname"] = u.nodename;
1941 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1942 } else {
1943 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1944 }
1945
1946 metadata["pid"] = stringify(getpid());
1947
1948 // Ceph entity id (the '0' in "client.0")
1949 metadata["entity_id"] = cct->_conf->name.get_id();
1950
1951 // Our mount position
1952 if (!mount_root.empty()) {
1953 metadata["root"] = mount_root;
1954 }
1955
1956 // Ceph version
1957 metadata["ceph_version"] = pretty_version_to_str();
1958 metadata["ceph_sha1"] = git_version_to_str();
1959
1960 // Apply any metadata from the user's configured overrides
1961 std::vector<std::string> tokens;
1962 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1963 for (const auto &i : tokens) {
1964 auto eqpos = i.find("=");
1965 // Throw out anything that isn't of the form "<str>=<str>"
1966 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1967 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1968 continue;
1969 }
1970 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1971 }
1972}
1973
1974/**
1975 * Optionally add or override client metadata fields.
1976 */
1977void Client::update_metadata(std::string const &k, std::string const &v)
1978{
1979 Mutex::Locker l(client_lock);
1980 assert(initialized);
1981
1982 if (metadata.count(k)) {
1983 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
1984 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
1985 }
1986
1987 metadata[k] = v;
1988}
1989
/*
 * Create a new session record for the given MDS and send the session-open
 * request.  The session is registered in mds_sessions (state OPENING)
 * before the open is sent; if this MDS instance previously REJECTed us,
 * the open message is skipped and the session is returned as-is.
 */
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
  assert(mds_sessions.count(mds) == 0);
  MetaSession *session = new MetaSession;
  session->mds_num = mds;
  session->seq = 0;
  session->inst = mdsmap->get_inst(mds);
  session->con = messenger->get_connection(session->inst);
  session->state = MetaSession::STATE_OPENING;
  session->mds_state = MDSMap::STATE_NULL;
  // register before the reject check: the session must exist in the map
  // even when we decide not to send the open
  mds_sessions[mds] = session;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->inst) {
      ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      // the rank is now served by a different instance; forget the old
      // rejection and try again
      ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
  m->client_meta = metadata;  // send our identifying metadata with the open
  session->con->send_message(m);
  return session;
}
2022
2023void Client::_close_mds_session(MetaSession *s)
2024{
2025 ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
2026 s->state = MetaSession::STATE_CLOSING;
2027 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2028}
2029
/*
 * Tear down a session that is now closed.  The cleanup order matters:
 * waiters are woken and caps/requests cleaned up before the session is
 * erased from the map and freed.
 */
void Client::_closed_mds_session(MetaSession *s)
{
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // wake anyone blocked waiting for this session to open
  signal_context_list(s->waiting_for_open);
  // wake mount/unmount logic that may be waiting on session state changes
  mount_cond.Signal();
  // drop all caps that were issued through this session
  remove_session_caps(s);
  // re-drive (or fail) requests that were targeted at this session
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
  delete s;
}
2041
/*
 * Handle an MClientSession message from an MDS: session open/close
 * acknowledgements, cap renewals, stale notifications, recall/flush/
 * readonly/reject control messages.  Consumes the message.
 */
void Client::handle_client_session(MClientSession *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  // only accept the message if it arrived on the connection our session is
  // bound to
  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    // session established: start the cap-renew cycle, open export-target
    // sessions (unless unmounting), and wake open-waiters
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only extend the cap TTL if this ack matches our latest renew request
    if (session->cap_renew_seq == m->get_seq()) {
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      wake_inode_waiters(session);
    }
    break;

  case CEPH_SESSION_STALE:
    // MDS thinks our session lapsed; try to renew
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    // MDS asks us to shrink our cap cache to the given size
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    // echo back so the MDS knows the message stream is drained
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    // remember the rejecting instance so we do not retry against it
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);

    break;

  default:
    ceph_abort();
  }

  m->put();
}
2105
2106bool Client::_any_stale_sessions() const
2107{
2108 assert(client_lock.is_locked_by_me());
2109
2110 for (const auto &i : mds_sessions) {
2111 if (i.second->state == MetaSession::STATE_STALE) {
2112 return true;
2113 }
2114 }
2115
2116 return false;
2117}
2118
2119void Client::_kick_stale_sessions()
2120{
2121 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2122
2123 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2124 p != mds_sessions.end(); ) {
2125 MetaSession *s = p->second;
2126 ++p;
2127 if (s->state == MetaSession::STATE_STALE)
2128 _closed_mds_session(s);
2129 }
2130}
2131
/**
 * (Re)build and transmit a MetaRequest to the given MDS session.
 *
 * @param request the pending metadata request to send
 * @param session target MDS session
 * @param drop_cap_releases if true (used before cap reconnect has been sent),
 *        discard the request's cap releases instead of piggybacking them
 */
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // we already received an unsafe reply: this is a replay, and the MDS
    // needs the target ino to match it against its completed-request log
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may reference pool names; pin the osdmap epoch we resolved with
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // first transmission of this request: record when it went out
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;  // used later to decide ESTALE retries

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2176
/**
 * Construct the wire-format MClientRequest for a MetaRequest.
 *
 * Fills in tid, stamp, request head, file paths (deriving them from the
 * request's inode/dentry if not already set), payload data, retry count,
 * forward count and the caller's supplementary gid list.
 *
 * @return a newly allocated message; caller owns the ref.
 */
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// dentry has no inode yet (e.g. create): path = parent dir + name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // note: post-increment — the message carries the previous attempt count
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2214
2215
2216
/**
 * Handle an MDS telling us our request was forwarded to another rank.
 *
 * Resets the request's retry counter and target, records the new
 * destination in resend_mds, and wakes the issuing thread so it resends.
 * Always consumes the message ref.
 */
void Client::handle_client_request_forward(MClientRequestForward *fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    fwd->put();
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    // request already completed or was aborted
    ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
    fwd->put();
    return;
  }

  MetaRequest *request = mds_requests[tid];
  assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << "handle_client_request_forward tid " << tid
		 << " fwd " << fwd->get_num_fwd()
		 << " to mds." << fwd->get_dest_mds()
		 << ", resending to " << fwd->get_dest_mds()
		 << dendl;

  // mark unsent (mds = -1) and detach from the old session's request list;
  // the caller thread will pick resend_mds when it wakes
  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->Signal();

  fwd->put();
}
2255
2256bool Client::is_dir_operation(MetaRequest *req)
2257{
2258 int op = req->get_op();
2259 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2260 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2261 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2262 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2263 return true;
2264 return false;
2265}
2266
/**
 * Handle a reply (unsafe and/or safe) from an MDS for a pending request.
 *
 * An MDS may send an early "unsafe" reply (result known, not yet durable)
 * followed later by a "safe" reply once the change is journaled.  The
 * caller thread is only signalled once, on the first reply; the
 * dispatch_cond handshake makes this thread wait until the caller has
 * consumed the trace before continuing.  Always consumes the message ref
 * (ownership passes to request->reply on the success path).
 */
void Client::handle_client_reply(MClientReply *reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
	    << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // only give up (surface ESTALE) if retrying would hit the same MDS
    // with the same cap mseq — i.e. nothing has changed since we sent it
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      // a different/updated target exists: wake the caller to resend
      request->caller_cond->Signal();
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }
  
  assert(request->reply == NULL);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    // track the request until the safe reply arrives, on the session,
    // the parent directory (for dir ops) and the target inode
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wake for kick back
    while (request->dispatch_cond) {
      // caller clears dispatch_cond once it has consumed the reply
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
2369
2370void Client::_handle_full_flag(int64_t pool)
2371{
2372 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2373 << "on " << pool << dendl;
2374 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2375 // to do this rather than blocking, because otherwise when we fill up we
2376 // potentially lock caps forever on files with dirty pages, and we need
2377 // to be able to release those caps to the MDS so that it can delete files
2378 // and free up space.
2379 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2380
2381 // For all inodes with layouts in this pool and a pending flush write op
2382 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2383 // from ObjectCacher so that it doesn't re-issue the write in response to
2384 // the ENOSPC error.
2385 // Fortunately since we're cancelling everything in a given pool, we don't
2386 // need to know which ops belong to which ObjectSet, we can just blow all
2387 // the un-flushed cached data away and mark any dirty inodes' async_err
2388 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2389 // affecting this pool, and all the objectsets we're purging were also
2390 // in this pool.
2391 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2392 i != inode_map.end(); ++i)
2393 {
2394 Inode *inode = i->second;
2395 if (inode->oset.dirty_or_tx
2396 && (pool == -1 || inode->layout.pool_id == pool)) {
2397 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2398 << " has dirty objects, purging and setting ENOSPC" << dendl;
2399 objectcacher->purge_set(&inode->oset);
2400 inode->set_async_err(-ENOSPC);
2401 }
2402 }
2403
2404 if (cancelled_epoch != (epoch_t)-1) {
2405 set_cap_epoch_barrier(cancelled_epoch);
2406 }
2407}
2408
/**
 * Handle a new OSD map: detect blacklisting of this client and FULL flags.
 *
 * On becoming blacklisted we abort all pending MDS requests, force-close
 * all sessions and cancel outstanding OSD writes so unmount can proceed.
 * Pool/cluster FULL flags are delegated to _handle_full_flag().
 * Always consumes the message ref.
 */
void Client::handle_osd_map(MOSDMap *m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddr = messenger->get_myaddr();
  if (!blacklisted && new_blacklists.count(myaddr)) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
        });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;
    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
         p != mds_requests.end(); ) {
      auto req = p->second;
      ++p;
      req->abort(-EBLACKLISTED);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
    }

    // Progress aborts on any requests that were on this waitlist.  Any
    // requests that were on a waiting_for_open session waitlist
    // will get kicked during close session below.
    signal_cond_list(waiting_for_mdsmap);

    // Force-close all sessions: assume this is not abandoning any state
    // on the MDS side because the MDS will have seen the blacklist too.
    while(!mds_sessions.empty()) {
      auto i = mds_sessions.begin();
      auto session = i->second;
      _closed_mds_session(session);
    }

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
        return o.is_blacklisted(myaddr);});
  }

  if (objecter->osdmap_full_flag()) {
    // cluster-wide full flag: cancel writes in every pool
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }

  m->put();
}
2486
2487
2488// ------------------------
2489// incoming messages
2490
2491
/**
 * Messenger dispatch entry point: route each incoming message to its
 * type-specific handler under client_lock.
 *
 * @return true if the message was handled (or discarded because the
 *         client is not initialized); false to let another dispatcher
 *         try (unknown types, or command replies not from an MDS).
 */
bool Client::ms_dispatch(Message *m)
{
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    // only MDS command replies are ours; others belong to a different dispatcher
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // while unmounting, every handled message is a chance to shrink the cache;
  // poke unmount() whenever a trim pass actually released something
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size() 
           << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size() 
             << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2569
// Replace our cached FSMap with the newly received one, wake anyone
// waiting on it, and ack the subscription epoch to the monitor.
// Consumes the message ref.
void Client::handle_fs_map(MFSMap *m)
{
  fsmap.reset(new FSMap(m->get_fsmap()));
  m->put();

  signal_cond_list(waiting_for_fsmap);

  monclient->sub_got("fsmap", fsmap->get_epoch());
}
2579
// Replace our cached user-visible FSMap, ack the subscription epoch,
// and wake anyone waiting on it.  Consumes the message ref.
void Client::handle_fs_map_user(MFSMapUser *m)
{
  fsmap_user.reset(new FSMapUser);
  *fsmap_user = m->get_fsmap();
  m->put();

  monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
  signal_cond_list(waiting_for_fsmap);
}
2589
/**
 * Handle a new MDSMap epoch.
 *
 * Cancels admin commands aimed at MDS GIDs that vanished or went laggy,
 * then walks every session and reacts to its rank's state transition:
 * down -> mark connection down; new instance -> mark down and trim cache
 * ahead of reconnect; RECONNECT -> send reconnect; newly ACTIVE -> kick
 * pending requests and flushing caps; NULL beyond max_mds -> close.
 * Always consumes the message ref.
 */
void Client::handle_mds_map(MMDSMap* m)
{
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    // old or duplicate map; ignore
    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    m->put();
    return;
  }  

  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;

  // keep the previous map so we can compare per-rank states below
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = p->second;
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_inst(mds) != session->inst) {
      session->con->mark_down();
      session->inst = mdsmap->get_inst(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change
    
    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->get_connection(session->inst);
      send_reconnect(session);
    } else if (newstate >= MDSMap::STATE_ACTIVE) {
      if (oldstate < MDSMap::STATE_ACTIVE) {
	// kick new requests
	kick_requests(session);
	kick_flushing_caps(session);
	signal_context_list(session->waiting_for_open);
	kick_maxsize_requests(session);
	wake_inode_waiters(session);
      }
      connect_mds_targets(mds);
    } else if (newstate == MDSMap::STATE_NULL &&
	       mds >= mdsmap->get_max_mds()) {
      // rank no longer exists in the cluster
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  m->put();

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2681
/**
 * Send an MClientReconnect to an MDS that entered the RECONNECT state.
 *
 * Re-declares every cap we hold from that rank (with seq numbers reset),
 * the snaprealms they belong to, and any file locks, so the recovering
 * MDS can rebuild its per-client state.  Unsafe requests are re-sent so
 * the MDS can replay them in its clientreplay stage.
 */
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  if (session->release) {
    // any queued cap releases are now moot; the MDS is rebuilding state
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
	       << " " << ccap_string(in->caps[mds]->issued)
	       << " wants " << ccap_string(in->caps_wanted())
	       << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << "    path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      // claim everything the MDS had granted (implemented), not just what
      // it had confirmed as issued
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino, 
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	// describe each snaprealm only once per reconnect message
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }	
    }
  }

  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
2757
2758
/**
 * Re-send new (never-replied) requests targeted at this session's MDS,
 * typically after the MDS became active again.  Aborted requests are
 * woken instead of resent; requests that already got an unsafe reply
 * or were previously transmitted are handled by resend_unsafe_requests.
 */
void Client::kick_requests(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted()) {
      // wake the caller so it can observe the abort
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      continue;
    }
    if (req->retry_attempt > 0)
      continue; // new requests only
    if (req->mds == session->mds_num) {
      send_request(p->second, session);
    }
  }
}
2782
/**
 * During MDS reconnect, re-send requests whose outcome the recovering MDS
 * must replay: first everything with an outstanding unsafe reply, then any
 * previously-transmitted (retry_attempt > 0) requests so the MDS can
 * process already-completed ones in its clientreplay stage.
 */
void Client::resend_unsafe_requests(MetaSession *session)
{
  for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
       !iter.end();
       ++iter)
    send_request(*iter, session);

  // also re-send old requests when MDS enters reconnect stage. So that MDS can
  // process completed requests in clientreplay stage.
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;  // already covered by the unsafe list above
    if (req->aborted())
      continue;
    if (req->retry_attempt == 0)
      continue; // old requests only
    if (req->mds == session->mds_num)
      send_request(req, session, true);  // drop cap releases: reconnect not sent yet
  }
}
2806
/**
 * Block until every outstanding unsafe request has been committed.
 *
 * It suffices to wait on the *last* unsafe request of each session: the
 * MDS journals in order, so when the tail is safe the whole list is.
 * Each request is pinned with get()/put() so it can't be freed while we
 * wait on it.
 */
void Client::wait_unsafe_requests()
{
  list<MetaRequest*> last_unsafe_reqs;
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    MetaSession *s = p->second;
    if (!s->unsafe_requests.empty()) {
      MetaRequest *req = s->unsafe_requests.back();
      req->get();
      last_unsafe_reqs.push_back(req);
    }
  }

  for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
       p != last_unsafe_reqs.end();
       ++p) {
    MetaRequest *req = *p;
    if (req->unsafe_item.is_on_list())
      wait_on_list(req->waitfor_safe);
    put_request(req);
  }
}
2830
/**
 * Abandon all requests targeted at a session that is being closed: wake
 * their callers, detach them from the session, and unregister any that
 * had received an unsafe reply (their safe reply will never arrive).
 * Leaves the session's request lists empty.
 */
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;  // advance first: unregister_request below invalidates p's entry
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
2857
2858
2859
2860
2861/************
2862 * leases
2863 */
2864
2865void Client::got_mds_push(MetaSession *s)
2866{
2867 s->seq++;
2868 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2869 if (s->state == MetaSession::STATE_CLOSING) {
2870 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2871 }
2872}
2873
/**
 * Handle a dentry-lease revoke from an MDS.
 *
 * If we hold the named dentry, drop its lease locally; in every case
 * (even when we don't know the inode or dentry) we must answer with a
 * LEASE_RELEASE so the MDS can make progress.  Consumes the message ref.
 */
void Client::handle_lease(MClientLease *m)
{
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  // revoke is the only action the MDS currently sends
  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // lease no longer valid
  }

 revoke:
  // always ack the revoke, even if we had nothing to drop
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2916
/**
 * Drop n references on an inode; on the last ref, release its caps,
 * verify the object cache holds no dirty data for it, remove it from
 * inode_map (and the faked-ino table), clear root bookkeeping if it was
 * the root, and delete it.
 */
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << "put_inode on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    assert(!unclean);  // dropping an inode with un-flushed data would lose writes
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
2942
/**
 * Destroy an (empty) Dir and drop the pins it held: the parent dentry's
 * dir->dn pin and the parent inode's ref.
 */
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
  assert(dir->is_empty());
  assert(in->dir == dir);
  assert(in->dn_set.size() < 2); // dirs can't be hard-linked
  if (!in->dn_set.empty())
    in->get_first_parent()->put(); // unpin dentry
  
  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}
2957
2958 /**
2959 * Don't call this with in==NULL, use get_or_create for that
2960 * leave dn set to default NULL unless you're trying to add
2961 * a new inode to a pre-created Dentry
2962 */
/**
 * Link an inode into a directory under the given name.
 *
 * @param dir  parent directory
 * @param name dentry name within dir
 * @param in   inode to link (must be non-NULL; see header comment above)
 * @param dn   existing dentry to reuse, or NULL to allocate a new one
 * @return the (new or reused) dentry
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;
    
    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn);    // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    if (in->is_dir()) {
      // extra pins keep the dentry alive while the dir/ll refs exist
      if (in->dir)
	dn->get(); // dir -> dn pin
      if (in->ll_ref)
	dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dn_set << dendl; 
  }
  
  return dn;
}
3010
3011void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3012{
3013 InodeRef in;
3014 in.swap(dn->inode);
3015 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3016 << " inode " << dn->inode << dendl;
3017
3018 // unlink from inode
3019 if (in) {
3020 if (in->is_dir()) {
3021 if (in->dir)
3022 dn->put(); // dir -> dn pin
3023 if (in->ll_ref)
3024 dn->put(); // ll_ref -> dn pin
3025 }
3026 dn->inode = 0;
3027 assert(in->dn_set.count(dn));
3028 in->dn_set.erase(dn);
3029 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3030 }
3031
3032 if (keepdentry) {
3033 dn->lease_mds = -1;
3034 } else {
3035 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3036
3037 // unlink from dir
3038 dn->dir->dentries.erase(dn->name);
3039 if (dn->dir->is_empty() && !keepdir)
3040 close_dir(dn->dir);
3041 dn->dir = 0;
3042
3043 // delete den
3044 lru.lru_remove(dn);
3045 dn->put();
3046 }
3047}
3048
3049/**
3050 * For asynchronous flushes, check for errors from the IO and
3051 * update the inode if necessary
3052 */
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;   // holds a ref so the inode outlives the async flush
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // surface the error to the application on a later fsync/close
      inode->set_async_err(r);
    }
  }
};
3070
3071
3072/****
3073 * caps
3074 */
3075
/**
 * Take cap references on an inode.  The first FILE_BUFFER or FILE_CACHE
 * ref additionally pins the inode itself (matching put_inode calls are
 * made when the last such ref is dropped in put_cap_ref).
 */
void Client::get_cap_ref(Inode *in, int cap)
{
  if ((cap & CEPH_CAP_FILE_BUFFER) &&
      in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
    ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
    in->get();
  }
  if ((cap & CEPH_CAP_FILE_CACHE) &&
      in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
    ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
    in->get();
  }
  in->get_cap_ref(cap);
}
3090
/**
 * Drop cap references on an inode.  When the last ref of a given cap is
 * dropped: finish any pending cap_snap (on last FILE_WR), clear cap_snap
 * dirty-data flags and wake committers (on last FILE_BUFFER), release the
 * inode pins taken in get_cap_ref, and check_caps if anything is no
 * longer issued.
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    int drop = last & ~in->caps_issued();  // caps we held but no longer have issued
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3124
3125int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
3126{
3127 int r = check_pool_perm(in, need);
3128 if (r < 0)
3129 return r;
3130
3131 while (1) {
3132 int file_wanted = in->caps_file_wanted();
3133 if ((file_wanted & need) != need) {
3134 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3135 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3136 << dendl;
3137 return -EBADF;
3138 }
3139
3140 int implemented;
3141 int have = in->caps_issued(&implemented);
3142
3143 bool waitfor_caps = false;
3144 bool waitfor_commit = false;
3145
3146 if (have & need & CEPH_CAP_FILE_WR) {
3147 if (endoff > 0 &&
3148 (endoff >= (loff_t)in->max_size ||
3149 endoff > (loff_t)(in->size << 1)) &&
3150 endoff > (loff_t)in->wanted_max_size) {
3151 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3152 in->wanted_max_size = endoff;
3153 check_caps(in, 0);
3154 }
3155
3156 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3157 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3158 waitfor_caps = true;
3159 }
3160 if (!in->cap_snaps.empty()) {
3161 if (in->cap_snaps.rbegin()->second.writing) {
3162 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3163 waitfor_caps = true;
3164 }
3165 for (auto &p : in->cap_snaps) {
3166 if (p.second.dirty_data) {
3167 waitfor_commit = true;
3168 break;
3169 }
3170 }
3171 if (waitfor_commit) {
3172 _flush(in, new C_Client_FlushComplete(this, in));
3173 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3174 }
3175 }
3176 }
3177
3178 if (!waitfor_caps && !waitfor_commit) {
3179 if ((have & need) == need) {
7c673cae
FG
3180 int revoking = implemented & ~have;
3181 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3182 << " need " << ccap_string(need) << " want " << ccap_string(want)
c07f9fc5 3183 << " revoking " << ccap_string(revoking)
7c673cae 3184 << dendl;
c07f9fc5 3185 if ((revoking & want) == 0) {
7c673cae
FG
3186 *phave = need | (have & want);
3187 in->get_cap_ref(need);
3188 return 0;
3189 }
3190 }
3191 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3192 waitfor_caps = true;
3193 }
3194
3195 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3196 in->auth_cap->session->readonly)
3197 return -EROFS;
3198
3199 if (in->flags & I_CAP_DROPPED) {
3200 int mds_wanted = in->caps_mds_wanted();
3201 if ((mds_wanted & need) != need) {
3202 int ret = _renew_caps(in);
3203 if (ret < 0)
3204 return ret;
3205 continue;
3206 }
3207 if ((mds_wanted & file_wanted) ==
3208 (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
3209 in->flags &= ~I_CAP_DROPPED;
3210 }
3211 }
3212
3213 if (waitfor_caps)
3214 wait_on_list(in->waitfor_caps);
3215 else if (waitfor_commit)
3216 wait_on_list(in->waitfor_commit);
3217 }
3218}
3219
3220int Client::get_caps_used(Inode *in)
3221{
3222 unsigned used = in->caps_used();
3223 if (!(used & CEPH_CAP_FILE_CACHE) &&
3224 !objectcacher->set_is_empty(&in->oset))
3225 used |= CEPH_CAP_FILE_CACHE;
3226 return used;
3227}
3228
3229void Client::cap_delay_requeue(Inode *in)
3230{
3231 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3232 in->hold_caps_until = ceph_clock_now();
3233 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3234 delayed_caps.push_back(&in->cap_item);
3235}
3236
/*
 * Send a cap update/flush for 'cap' to its MDS.
 *
 * First adjusts the cap's local issued/implemented state to reflect what
 * we are keeping (retain) and what is still in use (used), then builds
 * and sends an MClientCaps message carrying current inode metadata and
 * any dirty caps being flushed (flush/flush_tid).
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      bool sync, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;   // never retain what the MDS is revoking
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
	   << " mds." << session->mds_num << " seq " << cap->seq
	   << (sync ? " sync " : " async ")
	   << " used " << ccap_string(used)
	   << " want " << ccap_string(want)
	   << " flush " << ccap_string(flush)
	   << " retain " << ccap_string(retain)
	   << " held "<< ccap_string(held)
	   << " revoking " << ccap_string(revoking)
	   << " dropping " << ccap_string(dropping)
	   << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  // a flush records which snap context it follows
  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
				   in->ino,
				   0,
				   cap->cap_id, cap->seq,
				   cap->implemented,
				   want,
				   flush,
				   cap->mseq,
				   cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // xattrs are only shipped when they are part of what is being flushed
  if (flush & CEPH_CAP_XATTR_EXCL) {
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth cap carries the max_size request
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3346
31f18b77
FG
3347static bool is_max_size_approaching(Inode *in)
3348{
3349 /* mds will adjust max size according to the reported size */
3350 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3351 return false;
3352 if (in->size >= in->max_size)
3353 return true;
3354 /* half of previous max_size increment has been used */
3355 if (in->max_size > in->reported_size &&
3356 (in->size << 1) >= in->max_size + in->reported_size)
3357 return true;
3358 return false;
3359}
7c673cae
FG
3360
3361/**
3362 * check_caps
3363 *
3364 * Examine currently used and wanted versus held caps. Release, flush or ack
3365 * revoked caps to the MDS as appropriate.
3366 *
3367 * @param in the inode to check
3368 * @param flags flags to apply to cap check
3369 */
3370void Client::check_caps(Inode *in, unsigned flags)
3371{
3372 unsigned wanted = in->caps_wanted();
3373 unsigned used = get_caps_used(in);
3374 unsigned cap_used;
3375
3376 if (in->is_dir() && (in->flags & I_COMPLETE)) {
3377 // we do this here because we don't want to drop to Fs (and then
3378 // drop the Fs if we do a create!) if that alone makes us send lookups
3379 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3380 wanted |= CEPH_CAP_FILE_EXCL;
3381 }
3382
3383 int implemented;
3384 int issued = in->caps_issued(&implemented);
3385 int revoking = implemented & ~issued;
3386
3387 int retain = wanted | used | CEPH_CAP_PIN;
3388 if (!unmounting) {
3389 if (wanted)
3390 retain |= CEPH_CAP_ANY;
3391 else
3392 retain |= CEPH_CAP_ANY_SHARED;
3393 }
3394
3395 ldout(cct, 10) << "check_caps on " << *in
3396 << " wanted " << ccap_string(wanted)
3397 << " used " << ccap_string(used)
3398 << " issued " << ccap_string(issued)
3399 << " revoking " << ccap_string(revoking)
3400 << " flags=" << flags
3401 << dendl;
3402
3403 if (in->snapid != CEPH_NOSNAP)
3404 return; //snap caps last forever, can't write
3405
3406 if (in->caps.empty())
3407 return; // guard if at end of func
3408
3409 if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
3410 (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER))
3411 _release(in);
3412
3413 if (!in->cap_snaps.empty())
3414 flush_snaps(in);
3415
3416 if (flags & CHECK_CAPS_NODELAY)
3417 in->hold_caps_until = utime_t();
3418 else
3419 cap_delay_requeue(in);
3420
3421 utime_t now = ceph_clock_now();
3422
3423 map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
3424 while (it != in->caps.end()) {
3425 mds_rank_t mds = it->first;
3426 Cap *cap = it->second;
3427 ++it;
3428
3429 MetaSession *session = mds_sessions[mds];
3430 assert(session);
3431
3432 cap_used = used;
3433 if (in->auth_cap && cap != in->auth_cap)
3434 cap_used &= ~in->auth_cap->issued;
3435
3436 revoking = cap->implemented & ~cap->issued;
3437
3438 ldout(cct, 10) << " cap mds." << mds
3439 << " issued " << ccap_string(cap->issued)
3440 << " implemented " << ccap_string(cap->implemented)
3441 << " revoking " << ccap_string(revoking) << dendl;
3442
3443 if (in->wanted_max_size > in->max_size &&
3444 in->wanted_max_size > in->requested_max_size &&
3445 cap == in->auth_cap)
3446 goto ack;
3447
3448 /* approaching file_max? */
3449 if ((cap->issued & CEPH_CAP_FILE_WR) &&
31f18b77
FG
3450 cap == in->auth_cap &&
3451 is_max_size_approaching(in)) {
7c673cae 3452 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
31f18b77 3453 << ", reported " << in->reported_size << dendl;
7c673cae
FG
3454 goto ack;
3455 }
3456
3457 /* completed revocation? */
3458 if (revoking && (revoking & cap_used) == 0) {
3459 ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
3460 goto ack;
3461 }
3462
3463 /* want more caps from mds? */
3464 if (wanted & ~(cap->wanted | cap->issued))
3465 goto ack;
3466
3467 if (!revoking && unmounting && (cap_used == 0))
3468 goto ack;
3469
3470 if (wanted == cap->wanted && // mds knows what we want.
3471 ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
3472 !in->dirty_caps) // and we have no dirty caps
3473 continue;
3474
3475 if (now < in->hold_caps_until) {
3476 ldout(cct, 10) << "delaying cap release" << dendl;
3477 continue;
3478 }
3479
3480 ack:
3481 // re-send old cap/snapcap flushes first.
3482 if (session->mds_state >= MDSMap::STATE_RECONNECT &&
3483 session->mds_state < MDSMap::STATE_ACTIVE &&
3484 session->early_flushing_caps.count(in) == 0) {
3485 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3486 << " to mds." << session->mds_num << dendl;
3487 session->early_flushing_caps.insert(in);
3488 if (in->cap_snaps.size())
3489 flush_snaps(in, true);
3490 if (in->flushing_caps)
3491 flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
3492 }
3493
3494 int flushing;
3495 ceph_tid_t flush_tid;
3496 if (in->auth_cap == cap && in->dirty_caps) {
3497 flushing = mark_caps_flushing(in, &flush_tid);
3498 } else {
3499 flushing = 0;
3500 flush_tid = 0;
3501 }
3502
3503 send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
3504 retain, flushing, flush_tid);
3505 }
3506}
3507
3508
/*
 * Queue a cap snapshot for this inode against the given (old) snap
 * context, capturing the currently dirty state so it can later be sent
 * to the MDS as a FLUSHSNAP.
 *
 * Nothing is queued if a cap_snap is already pending (still writing), or
 * if there is nothing dirty or being written worth preserving.
 */
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // the newest cap_snap is still being written; don't queue another
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
	     (dirty & CEPH_CAP_ANY_WR)) {
    // construct the CapSnap in place, keyed by the old snap context's seq
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // writers in flight: the snap is completed when the last FILE_WR
      // ref drops (see put_cap_ref)
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3548
3549void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3550{
3551 ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3552 capsnap.size = in->size;
3553 capsnap.mtime = in->mtime;
3554 capsnap.atime = in->atime;
3555 capsnap.ctime = in->ctime;
3556 capsnap.time_warp_seq = in->time_warp_seq;
3557 capsnap.change_attr = in->change_attr;
3558
3559 capsnap.dirty |= in->caps_dirty();
3560
3561 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3562 capsnap.inline_data = in->inline_data;
3563 capsnap.inline_version = in->inline_version;
3564 }
3565
3566 if (used & CEPH_CAP_FILE_BUFFER) {
3567 ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
3568 << " WRBUFFER, delaying" << dendl;
3569 } else {
3570 capsnap.dirty_data = 0;
3571 flush_snaps(in);
3572 }
3573}
3574
3575void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3576{
3577 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3578 in->cap_snaps.at(seq).dirty_data = 0;
3579 flush_snaps(in);
3580}
3581
/*
 * Send FLUSHSNAP messages to the auth MDS for this inode's queued
 * cap_snaps.  Snaps still being written or with dirty buffered data are
 * skipped; they get flushed later via put_cap_ref / _flushed_cap_snap.
 *
 * @param all_again when true, re-send cap_snaps that already have a
 *        flush_tid (used when reflushing after an MDS reconnect)
 */
void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  assert(in->cap_snaps.size());

  // pick auth mds
  assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
	continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    if (capsnap.dirty_data || capsnap.writing)
      continue;   // not ready yet; flushed once data/writers drain

    if (capsnap.flush_tid == 0) {
      // first flush of this capsnap: register it with the session
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
	session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
				     cap_epoch_barrier);
    if (user_id >= 0)
      m->caller_uid = user_id;
    if (group_id >= 0)
      m->caller_gid = group_id;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    ::encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message(m);
  }
}
3658
3659
3660
3661void Client::wait_on_list(list<Cond*>& ls)
3662{
3663 Cond cond;
3664 ls.push_back(&cond);
3665 cond.Wait(client_lock);
3666 ls.remove(&cond);
3667}
3668
3669void Client::signal_cond_list(list<Cond*>& ls)
3670{
3671 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3672 (*it)->Signal();
3673}
3674
3675void Client::wait_on_context_list(list<Context*>& ls)
3676{
3677 Cond cond;
3678 bool done = false;
3679 int r;
3680 ls.push_back(new C_Cond(&cond, &done, &r));
3681 while (!done)
3682 cond.Wait(client_lock);
3683}
3684
3685void Client::signal_context_list(list<Context*>& ls)
3686{
3687 while (!ls.empty()) {
3688 ls.front()->complete(0);
3689 ls.pop_front();
3690 }
3691}
3692
3693void Client::wake_inode_waiters(MetaSession *s)
3694{
3695 xlist<Cap*>::iterator iter = s->caps.begin();
3696 while (!iter.end()){
3697 signal_cond_list((*iter)->inode->waitfor_caps);
3698 ++iter;
3699 }
3700}
3701
3702
3703// flush dirty data (from objectcache)
3704
// Completion that invokes the registered cache-invalidate callback for
// an inode range, run from outside client_lock (queued on
// async_ino_invalidator by _schedule_invalidate_callback).
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;          // captured at construction; the Inode* is not kept
  int64_t offset, length;  // byte range to invalidate (0,0 = whole inode)
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3724
3725void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3726{
3727 if (unmounting)
3728 return;
3729 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3730 ino_invalidate_cb(callback_handle, ino, off, len);
3731}
3732
3733void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3734
3735 if (ino_invalidate_cb)
3736 // we queue the invalidate, which calls the callback and decrements the ref
3737 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3738}
3739
3740void Client::_invalidate_inode_cache(Inode *in)
3741{
3742 ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
3743
3744 // invalidate our userspace inode cache
3745 if (cct->_conf->client_oc)
3746 objectcacher->release_set(&in->oset);
3747
3748 _schedule_invalidate_callback(in, 0, 0);
3749}
3750
3751void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3752{
3753 ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
3754
3755 // invalidate our userspace inode cache
3756 if (cct->_conf->client_oc) {
3757 vector<ObjectExtent> ls;
3758 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3759 objectcacher->discard_set(&in->oset, ls);
3760 }
3761
3762 _schedule_invalidate_callback(in, off, len);
3763}
3764
3765bool Client::_release(Inode *in)
3766{
3767 ldout(cct, 20) << "_release " << *in << dendl;
3768 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3769 _invalidate_inode_cache(in);
3770 return true;
3771 }
3772 return false;
3773}
3774
3775bool Client::_flush(Inode *in, Context *onfinish)
3776{
3777 ldout(cct, 10) << "_flush " << *in << dendl;
3778
3779 if (!in->oset.dirty_or_tx) {
3780 ldout(cct, 10) << " nothing to flush" << dendl;
3781 onfinish->complete(0);
3782 return true;
3783 }
3784
3785 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3786 ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3787 objectcacher->purge_set(&in->oset);
3788 if (onfinish) {
3789 onfinish->complete(-ENOSPC);
3790 }
3791 return true;
3792 }
3793
3794 return objectcacher->flush_set(&in->oset, onfinish);
3795}
3796
/*
 * Synchronously flush buffered data in [offset, offset+size) for this
 * inode.  client_lock is dropped while waiting for the object cacher to
 * signal completion, then re-taken before returning.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // private mutex/cond pair for the completion; C_SafeCond sets 'safe'
  // and signals 'cond' under 'flock' when the flush is done
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3821
// Object-cacher flush-completion hook: resolve the ObjectSet back to
// its owning Inode and forward to _flushed().
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  //  Mutex::Locker l(client_lock);
  assert(client_lock.is_locked());   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  assert(in);
  _flushed(in);
}
3830
// A flush of this inode's object set completed: release the FILE_CACHE
// and FILE_BUFFER cap references held for the flush.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3837
3838
3839
3840// checks common to add_update_cap, handle_cap_grant
3841void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3842{
3843 unsigned had = in->caps_issued();
3844
3845 if ((issued & CEPH_CAP_FILE_CACHE) &&
3846 !(had & CEPH_CAP_FILE_CACHE))
3847 in->cache_gen++;
3848
3849 if ((issued & CEPH_CAP_FILE_SHARED) &&
3850 !(had & CEPH_CAP_FILE_SHARED)) {
3851 in->shared_gen++;
3852
3853 if (in->is_dir())
3854 clear_dir_complete_and_ordered(in, true);
3855 }
3856}
3857
/*
 * Record a newly granted or updated capability from mds_session for this
 * inode: create the Cap (and open/pin the snap realm) on first grant,
 * maintain auth-cap tracking across MDS migrations, and wake cap waiters
 * when new bits are issued.
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
			    int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // first cap from this MDS for this inode
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // first cap overall: open and pin the inode's snap realm
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
    cap_list.push_back(&in->cap_item);
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // switch auth cap if this MDS has a newer migration seq
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << "add_update_cap changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued |= issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
	continue;
      if (it->second->implemented & ~it->second->issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
3943
/*
 * Drop a single capability.  Optionally queues a cap-release to the MDS
 * (queue_release), detaches auth-cap/flushing bookkeeping, and closes
 * the inode's snap realm when the last cap is gone.
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    // batch the release into the session's next cap-release message
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;

  if (!in->is_any_caps()) {
    // that was the last cap: unpin and close the snap realm
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
3982
// Drop every cap this inode holds, queueing releases to the MDSes.
void Client::remove_all_caps(Inode *in)
{
  while (!in->caps.empty())
    remove_cap(in->caps.begin()->second, true);
}
3988
/*
 * Drop every cap held under this session without queueing releases (the
 * session itself is going away).  Dirty/flushing state is discarded with
 * a loud error, and cap waiters are woken so they can re-request caps.
 */
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;   // get_caps() will re-request from the MDS
    }
    remove_cap(cap, false);
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // hold a ref while discarding the cap_snaps
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->dirty_caps = 0;
      put_inode(in);   // drop the ref taken by mark_caps_dirty
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4024
4025class C_Client_Remount : public Context {
4026private:
4027 Client *client;
4028public:
4029 explicit C_Client_Remount(Client *c) : client(c) {}
4030 void finish(int r) override {
4031 assert (r == 0);
4032 r = client->remount_cb(client->callback_handle);
4033 if (r != 0) {
4034 client_t whoami = client->get_nodeid();
4035 lderr(client->cct) << "tried to remount (to trim kernel dentries) and got error "
4036 << r << dendl;
4037 if (client->require_remount && !client->unmounting) {
4038 assert(0 == "failed to remount for kernel dentry trimming");
4039 }
4040 }
4041 }
4042};
4043
4044void Client::_invalidate_kernel_dcache()
4045{
4046 if (unmounting)
4047 return;
4048 if (can_invalidate_dentries && dentry_invalidate_cb && root->dir) {
4049 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4050 p != root->dir->dentries.end();
4051 ++p) {
4052 if (p->second->inode)
4053 _schedule_invalidate_dentry_callback(p->second, false);
4054 }
4055 } else if (remount_cb) {
4056 // Hacky:
4057 // when remounting a file system, linux kernel trims all unused dentries in the fs
4058 remount_finisher.queue(new C_Client_Remount(this));
4059 }
4060}
4061
/*
 * Trim this session's cap count down toward 'max' at the MDS' request.
 * Unused non-auth caps are released outright; otherwise we try to trim
 * the inode's dentries so the cap can be dropped once unpinned.
 */
void Client::trim_caps(MetaSession *s, int max)
{
  mds_rank_t mds = s->mds_num;
  int caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  int trimmed = 0;
  xlist<Cap*>::iterator p = s->caps.begin();
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    Inode *in = cap->inode;

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	remove_cap(cap, true);
	trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      InodeRef tmp_ref(in);   // keep the inode alive while trimming its dentries
      while (q != in->dn_set.end()) {
	Dentry *dn = *q++;
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
	  trim_dentry(dn);
	} else {
	  ldout(cct, 20) << " not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      if (all && in->ino != MDS_INO_ROOT) {
	ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }

  // still over budget: ask the kernel to drop dentries too
  if (s->caps.size() > max)
    _invalidate_kernel_dcache();
}
4119
4120void Client::force_session_readonly(MetaSession *s)
4121{
4122 s->readonly = true;
4123 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4124 Inode *in = (*p)->inode;
4125 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4126 signal_cond_list(in->waitfor_caps);
4127 }
4128}
4129
4130void Client::mark_caps_dirty(Inode *in, int caps)
4131{
4132 ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> "
4133 << ccap_string(in->dirty_caps | caps) << dendl;
4134 if (caps && !in->caps_dirty())
4135 in->get();
4136 in->dirty_caps |= caps;
4137}
4138
/*
 * Transition the inode's dirty caps into the flushing state: assign a
 * new flush tid and register it with the auth session.
 *
 * @param ptid out: the flush tid assigned to this batch
 * @return the cap bits now being flushed
 */
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->dirty_caps = 0;

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4166
4167void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4168{
4169 for (auto &p : in->cap_snaps) {
4170 CapSnap &capsnap = p.second;
4171 if (capsnap.flush_tid > 0) {
4172 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4173 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4174 }
4175 }
4176 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4177 it != in->flushing_cap_tids.end();
4178 ++it) {
4179 old_s->flushing_caps_tids.erase(it->first);
4180 new_s->flushing_caps_tids.insert(it->first);
4181 }
4182 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4183}
4184
4185/*
4186 * Flush all caps back to the MDS. Because the callers generally wait on the
4187 * result of this function (syncfs and umount cases), we set
4188 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4189 */
4190void Client::flush_caps_sync()
4191{
4192 ldout(cct, 10) << __func__ << dendl;
4193 xlist<Inode*>::iterator p = delayed_caps.begin();
4194 while (!p.end()) {
4195 unsigned flags = CHECK_CAPS_NODELAY;
4196 Inode *in = *p;
4197
4198 ++p;
4199 delayed_caps.pop_front();
4200 if (p.end() && cap_list.empty())
4201 flags |= CHECK_CAPS_SYNCHRONOUS;
4202 check_caps(in, flags);
4203 }
4204
4205 // other caps, too
4206 p = cap_list.begin();
4207 while (!p.end()) {
4208 unsigned flags = CHECK_CAPS_NODELAY;
4209 Inode *in = *p;
4210
4211 ++p;
4212 if (p.end())
4213 flags |= CHECK_CAPS_SYNCHRONOUS;
4214 check_caps(in, flags);
4215 }
4216}
4217
4218void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
4219{
4220 ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
4221 Cap *cap = in->auth_cap;
4222 assert(cap->session == session);
4223
4224 for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
4225 p != in->flushing_cap_tids.end();
4226 ++p) {
4227 bool req_sync = false;
4228
4229 /* If this is a synchronous request, then flush the journal on last one */
4230 if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
4231 req_sync = true;
4232
4233 send_cap(in, session, cap, req_sync,
4234 (get_caps_used(in) | in->caps_dirty()),
4235 in->caps_wanted(), (cap->issued | cap->implemented),
4236 p->second, p->first);
4237 }
4238}
4239
// Block until all cap flushes on *in up to and including tid 'want'
// have been acked.  Woken via in->waitfor_caps, signalled from
// handle_cap_flush_ack.
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    // acks arrive in tid order, so the oldest outstanding tid tells us
    // whether anything <= want is still in flight
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;
    ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4253
// Block until every MDS session has acked all cap flushes up to tid
// 'want'.  sync_cond is signalled from handle_cap_flush_ack when a
// session's oldest pending tid advances; we rescan all sessions after
// each wakeup since the session map may have changed while waiting.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
		 << num_flushing_caps << " total flushing)" << dendl;
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    MetaSession *s = p->second;
    if (s->flushing_caps_tids.empty())
	continue;
    // smallest (oldest) outstanding tid on this session
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      sync_cond.Wait(client_lock);
      goto retry;
    }
  }
}
4274
4275void Client::kick_flushing_caps(MetaSession *session)
4276{
4277 mds_rank_t mds = session->mds_num;
4278 ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;
4279
4280 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4281 Inode *in = *p;
4282 if (session->early_flushing_caps.count(in))
4283 continue;
4284 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4285 if (in->cap_snaps.size())
4286 flush_snaps(in, true);
4287 if (in->flushing_caps)
4288 flush_caps(in, session);
4289 }
4290
4291 session->early_flushing_caps.clear();
4292}
4293
// During the reconnect stage, re-send cap flushes whose flushing bits
// are no longer covered by the issued caps (i.e. were revoked while
// the session was down).  Handled inodes are recorded in
// early_flushing_caps so kick_flushing_caps() won't re-send them.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    assert(in->auth_cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4320
4321void Client::kick_maxsize_requests(MetaSession *session)
4322{
4323 xlist<Cap*>::iterator iter = session->caps.begin();
4324 while (!iter.end()){
4325 (*iter)->inode->requested_max_size = 0;
4326 (*iter)->inode->wanted_max_size = 0;
4327 signal_cond_list((*iter)->inode->waitfor_caps);
4328 ++iter;
4329 }
4330}
4331
4332void SnapRealm::build_snap_context()
4333{
4334 set<snapid_t> snaps;
4335 snapid_t max_seq = seq;
4336
4337 // start with prior_parents?
4338 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4339 snaps.insert(prior_parent_snaps[i]);
4340
4341 // current parent's snaps
4342 if (pparent) {
4343 const SnapContext& psnapc = pparent->get_snap_context();
4344 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4345 if (psnapc.snaps[i] >= parent_since)
4346 snaps.insert(psnapc.snaps[i]);
4347 if (psnapc.seq > max_seq)
4348 max_seq = psnapc.seq;
4349 }
4350
4351 // my snaps
4352 for (unsigned i=0; i<my_snaps.size(); i++)
4353 snaps.insert(my_snaps[i]);
4354
4355 // ok!
4356 cached_snap_context.seq = max_seq;
4357 cached_snap_context.snaps.resize(0);
4358 cached_snap_context.snaps.reserve(snaps.size());
4359 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4360 cached_snap_context.snaps.push_back(*p);
4361}
4362
4363void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4364{
4365 list<SnapRealm*> q;
4366 q.push_back(realm);
4367
4368 while (!q.empty()) {
4369 realm = q.front();
4370 q.pop_front();
4371
4372 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4373 realm->invalidate_cache();
4374
4375 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4376 p != realm->pchildren.end();
4377 ++p)
4378 q.push_back(*p);
4379 }
4380}
4381
4382SnapRealm *Client::get_snap_realm(inodeno_t r)
4383{
4384 SnapRealm *realm = snap_realms[r];
4385 if (!realm)
4386 snap_realms[r] = realm = new SnapRealm(r);
4387 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4388 realm->nref++;
4389 return realm;
4390}
4391
4392SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4393{
4394 if (snap_realms.count(r) == 0) {
4395 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4396 return NULL;
4397 }
4398 SnapRealm *realm = snap_realms[r];
4399 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4400 realm->nref++;
4401 return realm;
4402}
4403
// Drop one reference on a snap realm.  On the last ref, remove it from
// the realm map, detach from its parent (recursively dropping the
// parent's ref, which may cascade up the tree) and delete it.
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);  // may cascade
    }
    delete realm;
  }
}
4417
4418bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4419{
4420 if (realm->parent != parent) {
4421 ldout(cct, 10) << "adjust_realm_parent " << *realm
4422 << " " << realm->parent << " -> " << parent << dendl;
4423 realm->parent = parent;
4424 if (realm->pparent) {
4425 realm->pparent->pchildren.erase(realm);
4426 put_snap_realm(realm->pparent);
4427 }
4428 realm->pparent = get_snap_realm(parent);
4429 realm->pparent->pchildren.insert(realm);
4430 return true;
4431 }
4432 return false;
4433}
4434
4435static bool has_new_snaps(const SnapContext& old_snapc,
4436 const SnapContext& new_snapc)
4437{
4438 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4439}
4440
4441
4442void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
4443{
4444 SnapRealm *first_realm = NULL;
4445 ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
4446
4447 map<SnapRealm*, SnapContext> dirty_realms;
4448
4449 bufferlist::iterator p = bl.begin();
4450 while (!p.end()) {
4451 SnapRealmInfo info;
4452 ::decode(info, p);
4453 SnapRealm *realm = get_snap_realm(info.ino());
4454
4455 bool invalidate = false;
4456
4457 if (info.seq() > realm->seq) {
4458 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
4459 << dendl;
4460
4461 if (flush) {
4462 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4463 // flush me + children
4464 list<SnapRealm*> q;
4465 q.push_back(realm);
4466 while (!q.empty()) {
4467 SnapRealm *realm = q.front();
4468 q.pop_front();
4469
4470 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4471 p != realm->pchildren.end();
4472 ++p)
4473 q.push_back(*p);
4474
4475 if (dirty_realms.count(realm) == 0) {
4476 realm->nref++;
4477 dirty_realms[realm] = realm->get_snap_context();
4478 }
4479 }
4480 }
4481
4482 // update
4483 realm->seq = info.seq();
4484 realm->created = info.created();
4485 realm->parent_since = info.parent_since();
4486 realm->prior_parent_snaps = info.prior_parent_snaps;
4487 realm->my_snaps = info.my_snaps;
4488 invalidate = true;
4489 }
4490
4491 // _always_ verify parent
4492 if (adjust_realm_parent(realm, info.parent()))
4493 invalidate = true;
4494
4495 if (invalidate) {
4496 invalidate_snaprealm_and_children(realm);
4497 ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
4498 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4499 } else {
4500 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
4501 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4502 }
4503
4504 if (!first_realm)
4505 first_realm = realm;
4506 else
4507 put_snap_realm(realm);
4508 }
4509
4510 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4511 q != dirty_realms.end();
4512 ++q) {
4513 SnapRealm *realm = q->first;
4514 // if there are new snaps ?
4515 if (has_new_snaps(q->second, realm->get_snap_context())) {
4516 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4517 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4518 while (!r.end()) {
4519 Inode *in = *r;
4520 ++r;
4521 queue_cap_snap(in, q->second);
4522 }
4523 } else {
4524 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4525 }
4526 put_snap_realm(realm);
4527 }
4528
4529 if (realm_ret)
4530 *realm_ret = first_realm;
4531 else
4532 put_snap_realm(first_realm);
4533}
4534
// Handle an MClientSnap notification from an MDS.  For SPLIT ops, the
// listed inodes and child realms are moved into the newly created
// realm before the snap trace is applied; inodes that thereby gained
// snaps get a cap snap queued.  Consumes the message reference.
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  // inode -> its old realm's snap context, captured before the move
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
	 p != m->split_inos.end();
	 ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// don't move out of a realm that postdates the split realm
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
	 p != m->split_realms.end();
	 ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // skip the pre-update dirty-cap flush when realms are being destroyed
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // attach the moved inodes to the split realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
4612
4613void Client::handle_quota(MClientQuota *m)
4614{
4615 mds_rank_t mds = mds_rank_t(m->get_source().num());
4616 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4617 if (!session) {
4618 m->put();
4619 return;
4620 }
4621
4622 got_mds_push(session);
4623
4624 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4625
4626 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4627 if (inode_map.count(vino)) {
4628 Inode *in = NULL;
4629 in = inode_map[vino];
4630
4631 if (in) {
4632 in->quota = m->quota;
4633 in->rstat = m->rstat;
4634 }
4635 }
4636
4637 m->put();
4638}
4639
// Entry point for all MClientCaps messages.  Locates the inode (or
// immediately releases the cap for an IMPORT on an unknown inode) and
// dispatches by op.  Consumes the message reference on every path
// (directly or via the per-op handler).
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    // stale/unknown session; drop the message
    m->put();
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload();  // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    // note: deliberately no return -- IMPORT installs the cap here and
    // then falls through so the caps carried by the message are applied
    // via handle_cap_grant() in the second switch below
    handle_cap_import(session, in, m);
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();
  }
}
4713
// Handle CEPH_CAP_OP_IMPORT: another MDS has migrated this inode's cap
// to this session.  Installs/updates the auth cap, removes the old
// peer cap, and re-kicks pending flushes if we became auth.  Called
// from handle_caps(), which afterwards applies the granted caps; does
// NOT consume m.
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // carry over the perms recorded on the cap being migrated away
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap) {
      cap_perms = cap->latest_perms;
    }
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
		 CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the exporting MDS's cap now that the import is installed
  if (cap && cap->cap_id == m->peer.cap_id) {
      remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4754
// Handle CEPH_CAP_OP_EXPORT: this MDS is migrating our cap away.  If
// the message names an importing peer, fold this cap's issued bits
// into the peer cap (updating it only if the export metadata is newer)
// or create one from the export metadata; with no peer, just record
// that the cap was dropped.  Consumes the message reference.
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
	Cap *tcap = in->caps[peer_mds];
	// only adopt the export if it matches the peer cap and is newer
	if (tcap->cap_id == m->peer.cap_id &&
	    ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
	  tcap->cap_id = m->peer.cap_id;
	  tcap->seq = m->peer.seq - 1;
	  tcap->issue_seq = tcap->seq;
	  tcap->mseq = m->peer.mseq;
	  tcap->issued |= cap->issued;
	  tcap->implemented |= cap->issued;
	  if (cap == in->auth_cap)
	    in->auth_cap = tcap;
	  // move in-flight flushes to the importing session
	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
	    adjust_session_flushing_caps(in, session, tsession);
	}
      } else {
	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
		       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
		       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
		       cap->latest_perms);
      }
    } else {
      // no importer: note that we lost the auth cap involuntarily
      if (cap == in->auth_cap)
	in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
4802
// Handle CEPH_CAP_OP_TRUNC: apply an MDS-initiated truncation/metadata
// update to the inode.  Consumes the message reference.
void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);

  ldout(cct, 10) << "handle_cap_trunc on ino " << *in
		 << " size " << in->size << " -> " << m->get_size()
		 << dendl;

  int implemented = 0;
  // include dirty and implemented bits in the 'issued' set passed to
  // update_inode_file_bits
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(),
                         m->get_size(), m->get_change_attr(), m->get_time_warp_seq(),
                         m->get_ctime(), m->get_mtime(), m->get_atime(),
                         m->inline_version, m->inline_data, issued);
  m->put();
}
4821
// Handle FLUSH_ACK: the MDS acknowledges cap flushes up to (and
// implicitly including every tid before) flush_ack_tid.  Clears the
// acked flushing bits, wakes waiters, and drops the dirty-cap inode
// ref when everything is clean.  Consumes the message reference.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // covered by the ack: retire the tid on both inode and session
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // bits still in flight under a later tid must not be cleared yet
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
		<< " cleaned " << ccap_string(cleaned) << " on " << *in
		<< " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake session-level waiters if this session's oldest pending tid
    // moved past the acked tid
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // drop the ref taken in mark_caps_dirty once fully clean
      if (!in->caps_dirty())
	put_inode(in);
    }
  }

  m->put();
}
4881
4882
// Handle FLUSHSNAP_ACK: the MDS has persisted the cap snap identified
// by its 'follows' snapid and flush tid; retire it.  Duplicate or
// stale acks are logged and ignored.  Consumes the message reference.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);
  snapid_t follows = m->get_snap_follows();

  if (in->cap_snaps.count(follows)) {
    CapSnap &capsnap = in->cap_snaps.at(follows);
    if (m->get_client_tid() != capsnap.flush_tid) {
      // ack for an older flush of this snap; ignore
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
	      << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
	tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
	in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(follows);
    }
  } else {
    ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
	    << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }

  m->put();
}
4912
// Completion that invokes Client::_async_dentry_invalidate on the
// async invalidator thread (outside client_lock).  The (faked or real)
// dir/inode numbers and the dentry name are captured at construction
// time, since the Dentry may be freed before finish() runs.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory of the dentry
  vinodeno_t ino;     // target inode; zeroed when del == false
  string name;        // dentry name
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
      if (client->use_faked_inos()) {
	dirino.ino = dn->dir->parent_inode->faked_ino;
	if (del)
	  ino.ino = dn->inode->faked_ino;
      } else {
	dirino = dn->dir->parent_inode->vino();
	if (del)
	  ino = dn->inode->vino();
      }
      if (!del)
	ino.ino = inodeno_t();
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
4940
// Runs on the async invalidator thread (client_lock not held -- see
// C_Client_DentryInvalidate::finish): invoke the registered dentry
// invalidate callback, unless the client is unmounting.
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}
4949
4950void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
4951{
4952 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
4953 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
4954}
4955
// Try to release references pinning *in so it can be trimmed: expire
// child dentries (recursing into snapshot subtrees), close an empty
// dir, trim an open snapdir, and optionally schedule kernel dcache
// invalidation for remaining ll-referenced dentries.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // close_dir released the dir's ref on the inode
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      // so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
4998
// Apply a cap GRANT/REVOKE (also reached for IMPORT) from the MDS:
// refresh inode metadata for fields we do NOT hold EXCL caps on, apply
// the new issued cap set, start writeback/cache release for revoked
// FILE caps, and wake waiters.  Consumes the message reference.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const int old_caps = cap->issued;
  const int new_caps = m->get_caps();
  ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(old_caps) << dendl;
  cap->seq = m->get_seq();

  in->layout = m->get_layout();

  // update inode
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;

  // only take MDS values for fields we don't hold EXCL caps on --
  // otherwise our locally dirtied copies are newer
  if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    bufferlist::iterator p = m->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
			 m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
			 m->get_mtime(), m->get_atime(),
			 m->inline_version, m->inline_data, issued);

  // max_size
  if (cap == in->auth_cap &&
      m->get_max_size() != in->max_size) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
    check = true;

  check_cap_issue(in, cap, new_caps);

  // update caps
  if (old_caps & ~new_caps) {
    ldout(cct, 10) << " revocation of " << ccap_string(~new_caps & old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // revoking FILE_BUFFER: flush dirty data first; revoking
    // FILE_CACHE: drop clean cached data; otherwise ack immediately
    if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
        && !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }

  } else if (old_caps == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(old_caps) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
	if (it->second == cap)
	  continue;
	if (it->second->implemented & ~it->second->issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);

  m->put();
}
5112
// Retrieve the supplementary group list for (uid, gid) into *sgids.
// Prefers the registered getgroups callback; otherwise falls back to
// getgrouplist(3) with a growable buffer.  Returns the number of
// groups (caller frees *sgids) or a negative errno.
int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid)
{
  // cppcheck-suppress variableScope
  int sgid_count;
  gid_t *sgid_buf;

  if (getgroups_cb) {
    sgid_count = getgroups_cb(callback_handle, &sgid_buf);
    if (sgid_count > 0) {
      *sgids = sgid_buf;
      return sgid_count;
    }
  }

#if HAVE_GETGROUPLIST
  struct passwd *pw;
  pw = getpwuid(uid);
  if (pw == NULL) {
    ldout(cct, 3) << "getting user entry failed" << dendl;
    return -errno;
  }
  //use PAM to get the group list
  // initial number of group entries, defaults to posix standard of 16
  // PAM implementations may provide more than 16 groups....
  sgid_count = 16;
  sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t));
  if (sgid_buf == NULL) {
    ldout(cct, 3) << "allocating group memory failed" << dendl;
    return -ENOMEM;
  }

  while (1) {
    // on failure getgrouplist updates sgid_count to the required size
#if defined(__APPLE__)
    if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) {
#else
    if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) {
#endif
      // we need to resize the group list and try again
      void *_realloc = NULL;
      if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) {
	ldout(cct, 3) << "allocating group memory failed" << dendl;
	free(sgid_buf);
	return -ENOMEM;
      }
      sgid_buf = (gid_t*)_realloc;
      continue;
    }
    // list was successfully retrieved
    break;
  }
  *sgids = sgid_buf;
  return sgid_count;
#else
  return 0;
#endif
}
5169
// Check whether 'perms' may access *in with the requested MAY_* bits.
// Root bypasses all checks; POSIX ACLs are consulted for non-owners,
// falling back to the classic mode-bit check when the ACL check
// returns -EAGAIN.  Returns 0 if allowed, -EACCES/negative otherwise.
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0)
    return 0;

  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -EAGAIN)
      return ret;  // ACL gave a definitive answer
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -EACCES;
  return 0;
}
5186
// Permission check for accessing xattr 'name' on *in.  Writes to the
// "system." namespace are restricted to root or the file owner; all
// other names use the standard inode permission check.
int Client::xattr_permission(Inode *in, const char *name, unsigned want,
			     const UserPerm& perms)
{
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = 0;
  if (strncmp(name, "system.", 7) == 0) {
    // only root or the owner may write system.* xattrs
    if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
      r = -EPERM;
  } else {
    r = inode_permission(in, perms, want);
  }
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5205
5206ostream& operator<<(ostream &out, const UserPerm& perm) {
5207 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5208 return out;
5209}
5210
// Check whether `perms` may apply the setattr described by (stx, mask)
// to `in`.  Returns 0 if allowed, -EPERM/-EACCES otherwise.  May clear
// S_ISGID in stx->stx_mode as a side effect (see chmod semantics below).
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // truncate requires write permission on the file
  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // default verdict for the ownership/mode/time checks below
  r = -EPERM;
  // chown: only root may change the owner; the owner may "change" it
  // to the same uid (a no-op chown).
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  // chgrp: non-root callers must own the file and either be a member of
  // the target group or leave the gid unchanged.
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
  	       (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    // chmod: root or owner only
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // if the caller is not in the file's (possibly new) group, strip
    // the setgid bit, mirroring POSIX chmod behavior
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // Non-owners may only set times to "now" (utimes(NULL)-style),
      // and even then need write permission.  Explicit timestamps
      // (including ctime/btime) are owner/root only.
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5267
5268int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5269{
181888fb 5270 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5271 unsigned want = 0;
5272
5273 if ((flags & O_ACCMODE) == O_WRONLY)
5274 want = MAY_WRITE;
5275 else if ((flags & O_ACCMODE) == O_RDWR)
5276 want = MAY_READ | MAY_WRITE;
5277 else if ((flags & O_ACCMODE) == O_RDONLY)
5278 want = MAY_READ;
5279 if (flags & O_TRUNC)
5280 want |= MAY_WRITE;
5281
5282 int r = 0;
5283 switch (in->mode & S_IFMT) {
5284 case S_IFLNK:
5285 r = -ELOOP;
5286 goto out;
5287 case S_IFDIR:
5288 if (want & MAY_WRITE) {
5289 r = -EISDIR;
5290 goto out;
5291 }
5292 break;
5293 }
5294
5295 r = _getattr_for_perm(in, perms);
5296 if (r < 0)
5297 goto out;
5298
5299 r = inode_permission(in, perms, want);
5300out:
5301 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5302 return r;
5303}
5304
5305int Client::may_lookup(Inode *dir, const UserPerm& perms)
5306{
181888fb 5307 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5308 int r = _getattr_for_perm(dir, perms);
5309 if (r < 0)
5310 goto out;
5311
5312 r = inode_permission(dir, perms, MAY_EXEC);
5313out:
5314 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5315 return r;
5316}
5317
5318int Client::may_create(Inode *dir, const UserPerm& perms)
5319{
181888fb 5320 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5321 int r = _getattr_for_perm(dir, perms);
5322 if (r < 0)
5323 goto out;
5324
5325 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5326out:
5327 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5328 return r;
5329}
5330
5331int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5332{
181888fb 5333 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5334 int r = _getattr_for_perm(dir, perms);
5335 if (r < 0)
5336 goto out;
5337
5338 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5339 if (r < 0)
5340 goto out;
5341
5342 /* 'name == NULL' means rmsnap */
5343 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5344 InodeRef otherin;
5345 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5346 if (r < 0)
5347 goto out;
5348 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5349 r = -EPERM;
5350 }
5351out:
5352 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5353 return r;
5354}
5355
5356int Client::may_hardlink(Inode *in, const UserPerm& perms)
5357{
181888fb 5358 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5359 int r = _getattr_for_perm(in, perms);
5360 if (r < 0)
5361 goto out;
5362
5363 if (perms.uid() == 0 || perms.uid() == in->uid) {
5364 r = 0;
5365 goto out;
5366 }
5367
5368 r = -EPERM;
5369 if (!S_ISREG(in->mode))
5370 goto out;
5371
5372 if (in->mode & S_ISUID)
5373 goto out;
5374
5375 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5376 goto out;
5377
5378 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5379out:
5380 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5381 return r;
5382}
5383
5384int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5385{
5386 int mask = CEPH_STAT_CAP_MODE;
5387 bool force = false;
5388 if (acl_type != NO_ACL) {
5389 mask |= CEPH_STAT_CAP_XATTR;
5390 force = in->xattr_version == 0;
5391 }
5392 return _getattr(in, mask, perms, force);
5393}
5394
// Return the (ino, snapid) pair identifying this inode.
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  return vinodeno_t(in->ino, in->snapid);
}
5400
// Return the bare inode number (without the snapshot id).
inodeno_t Client::_get_inodeno(Inode *in)
{
  /* The caller must hold the client lock */
  return in->ino;
}
5406
5407
5408/**
5409 * Resolve an MDS spec to a list of MDS daemon GIDs.
5410 *
5411 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5412 * It may be '*' in which case it matches all GIDs.
5413 *
5414 * If no error is returned, the `targets` vector will be populated with at least
5415 * one MDS.
5416 */
5417int Client::resolve_mds(
5418 const std::string &mds_spec,
5419 std::vector<mds_gid_t> *targets)
5420{
5421 assert(fsmap);
5422 assert(targets != nullptr);
5423
5424 mds_role_t role;
5425 std::stringstream ss;
5426 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5427 if (role_r == 0) {
5428 // We got a role, resolve it to a GID
5429 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5430 << role << "'" << dendl;
5431 targets->push_back(
5432 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5433 return 0;
5434 }
5435
5436 std::string strtol_err;
5437 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5438 if (strtol_err.empty()) {
5439 // It is a possible GID
5440 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5441 if (fsmap->gid_exists(mds_gid)) {
5442 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5443 targets->push_back(mds_gid);
5444 } else {
5445 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5446 << dendl;
5447 return -ENOENT;
5448 }
5449 } else if (mds_spec == "*") {
5450 // It is a wildcard: use all MDSs
5451 const auto mds_info = fsmap->get_mds_info();
5452
5453 if (mds_info.empty()) {
5454 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5455 return -ENOENT;
5456 }
5457
5458 for (const auto i : mds_info) {
5459 targets->push_back(i.first);
5460 }
5461 } else {
5462 // It did not parse as an integer, it is not a wildcard, it must be a name
5463 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5464 if (mds_gid == 0) {
5465 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5466
5467 lderr(cct) << "FSMap: " << *fsmap << dendl;
5468
5469 return -ENOENT;
5470 } else {
5471 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5472 << "' to GID " << mds_gid << dendl;
5473 targets->push_back(mds_gid);
5474 }
5475 }
5476
5477 return 0;
5478}
5479
5480
5481/**
5482 * Authenticate with mon and establish global ID
5483 */
5484int Client::authenticate()
5485{
5486 assert(client_lock.is_locked_by_me());
5487
5488 if (monclient->is_authenticated()) {
5489 return 0;
5490 }
5491
5492 client_lock.Unlock();
5493 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5494 client_lock.Lock();
5495 if (r < 0) {
5496 return r;
5497 }
5498
5499 whoami = monclient->get_global_id();
5500 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5501
5502 return 0;
5503}
5504
// Fetch the latest FSMap (or the lighter-weight FSMapUser when `user`
// is true) from the monitors, blocking until our cached copy is at
// least as new as the monitors' latest version.  Returns 0 on success
// or a negative errno.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    // get_version completes asynchronously; drop the client lock while
    // waiting so message dispatch can proceed
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) until our fsmap_user catches up to the
    // version the monitors reported
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full fsmap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5548
5549/**
5550 *
5551 * @mds_spec one of ID, rank, GID, "*"
5552 *
5553 */
5554int Client::mds_command(
5555 const std::string &mds_spec,
5556 const vector<string>& cmd,
5557 const bufferlist& inbl,
5558 bufferlist *outbl,
5559 string *outs,
5560 Context *onfinish)
5561{
5562 Mutex::Locker lock(client_lock);
5563
181888fb
FG
5564 if (!initialized)
5565 return -ENOTCONN;
7c673cae
FG
5566
5567 int r;
5568 r = authenticate();
5569 if (r < 0) {
5570 return r;
5571 }
5572
5573 r = fetch_fsmap(false);
5574 if (r < 0) {
5575 return r;
5576 }
5577
5578 // Look up MDS target(s) of the command
5579 std::vector<mds_gid_t> targets;
5580 r = resolve_mds(mds_spec, &targets);
5581 if (r < 0) {
5582 return r;
5583 }
5584
5585 // If daemons are laggy, we won't send them commands. If all
5586 // are laggy then we fail.
5587 std::vector<mds_gid_t> non_laggy;
5588 for (const auto gid : targets) {
5589 const auto info = fsmap->get_info_gid(gid);
5590 if (!info.laggy()) {
5591 non_laggy.push_back(gid);
5592 }
5593 }
5594 if (non_laggy.size() == 0) {
5595 *outs = "All targeted MDS daemons are laggy";
5596 return -ENOENT;
5597 }
5598
5599 if (metadata.empty()) {
5600 // We are called on an unmounted client, so metadata
5601 // won't be initialized yet.
5602 populate_metadata("");
5603 }
5604
5605 // Send commands to targets
5606 C_GatherBuilder gather(cct, onfinish);
5607 for (const auto target_gid : non_laggy) {
5608 const auto info = fsmap->get_info_gid(target_gid);
5609
5610 // Open a connection to the target MDS
5611 entity_inst_t inst = info.get_inst();
5612 ConnectionRef conn = messenger->get_connection(inst);
5613
5614 // Generate MDSCommandOp state
5615 auto &op = command_table.start_command();
5616
5617 op.on_finish = gather.new_sub();
5618 op.cmd = cmd;
5619 op.outbl = outbl;
5620 op.outs = outs;
5621 op.inbl = inbl;
5622 op.mds_gid = target_gid;
5623 op.con = conn;
5624
5625 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5626 << " tid=" << op.tid << cmd << dendl;
5627
5628 // Construct and send MCommand
5629 MCommand *m = op.get_message(monclient->get_fsid());
5630 conn->send_message(m);
5631 }
5632 gather.activate();
5633
5634 return 0;
5635}
5636
5637void Client::handle_command_reply(MCommandReply *m)
5638{
5639 ceph_tid_t const tid = m->get_tid();
5640
5641 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5642
5643 if (!command_table.exists(tid)) {
5644 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5645 m->put();
5646 return;
5647 }
5648
5649 auto &op = command_table.get_command(tid);
5650 if (op.outbl) {
5651 op.outbl->claim(m->get_data());
5652 }
5653 if (op.outs) {
5654 *op.outs = m->rs;
5655 }
5656
5657 if (op.on_finish) {
5658 op.on_finish->complete(m->r);
5659 }
5660
5661 command_table.erase(tid);
5662
5663 m->put();
5664}
5665
5666// -------------------
5667// MOUNT
5668
// Mount the filesystem: authenticate, subscribe to the MDS map,
// optionally wait for an available MDS cluster, then walk from the
// mount root up to "/" issuing getattrs so the root and its ancestors
// are cached.  Returns 0 on success or a negative errno.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  Mutex::Locker lock(client_lock);

  // mounting twice is a no-op
  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  // If a specific filesystem (namespace) was requested, resolve its
  // cluster id and subscribe to "mdsmap.<cid>" instead of plain "mdsmap".
  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
        // Error out
        ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
        return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
        // Continue to mount
        break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
        // Else, wait.  MDSMonitor will update the map to bring
        // us to a conclusion eventually.
        wait_on_list(waiting_for_mdsmap);
      } else {
        // Unexpected value!
        ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // Getattr the mount point, then each ancestor up to the root, so the
  // whole path is pinned in cache.  EACCES on an ancestor is tolerated
  // (quotas may not work in that case) once the root is known.
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
        ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
        break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  _ll_get(root);

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5779
5780// UNMOUNT
5781
5782void Client::_close_sessions()
5783{
5784 while (!mds_sessions.empty()) {
5785 // send session closes!
5786 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5787 p != mds_sessions.end();
5788 ++p) {
5789 if (p->second->state != MetaSession::STATE_CLOSING) {
5790 _close_mds_session(p->second);
5791 }
5792 }
5793
5794 // wait for sessions to close
5795 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5796 mount_cond.Wait(client_lock);
5797 }
5798}
5799
31f18b77
FG
5800void Client::flush_mdlog_sync()
5801{
5802 if (mds_requests.empty())
5803 return;
5804 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5805 p != mds_sessions.end();
5806 ++p) {
5807 MetaSession *s = p->second;
5808 flush_mdlog(s);
5809 }
5810}
5811
5812void Client::flush_mdlog(MetaSession *session)
5813{
5814 // Only send this to Luminous or newer MDS daemons, older daemons
5815 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5816 const uint64_t features = session->con->get_features();
5817 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5818 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5819 session->con->send_message(m);
5820 }
5821}
5822
5823
7c673cae
FG
// Unmount: drain outstanding requests, release open files/dirs, flush
// (or, if blacklisted, purge) cached data, wait for the cache to empty,
// and close all MDS sessions.  Idempotent — a second call returns once
// `unmounting` is set.
void Client::unmount()
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return;

  ldout(cct, 2) << "unmounting" << dendl;
  unmounting = true;

  flush_mdlog_sync(); // flush the mdlog for pending requests, if any
  // wait for all in-flight MDS requests to complete
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // stop the periodic tick
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // same for low-level (libcephfs ll_*) handles the caller never closed
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // and any directories left open
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (blacklisted) {
    ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;

    if (cct->_conf->client_oc) {
      // Purge all cached data so that ObjectCacher doesn't get hung up
      // trying to flush it.  ObjectCacher's behaviour on EBLACKLISTED
      // is to just leave things marked dirty
      // (http://tracker.ceph.com/issues/9105)
      for (const auto &i : inode_map) {
	objectcacher->purge_set(&(i.second->oset));
      }
    }

    mounted = false;
    return;
  }

  // wait for unsafe (unacked) sync writes to be acknowledged
  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    // (iterate with a saved `next` — _release/_flush may drop entries)
    ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
    for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
	 p != inode_map.end();
	 p = next) {
      next = p;
      ++next;
      Inode *in = p->second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
	assert(in);
      }
      if (!in->caps.empty()) {
	InodeRef tmp_ref(in);
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  flush_caps_sync();
  wait_sync_caps(last_flush_tid);

  // empty lru cache
  trim_cache();

  // wait (logging every 5s) until caps are released and the cache drains
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
	    << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
	    << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  assert(lru.lru_get_size() == 0);
  assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
5946
5947
5948
// Timer callback that drives Client::tick(); tick() re-arms it each
// interval by scheduling a fresh C_C_Tick.
class C_C_Tick : public Context {
  Client *client;
public:
  explicit C_C_Tick(Client *c) : client(c) {}
  void finish(int r) override {
    // Called back via Timer, which takes client_lock for us
    assert(client->client_lock.is_locked_by_me());
    client->tick();
  }
};
5959
5960void Client::flush_cap_releases()
5961{
5962 // send any cap releases
5963 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5964 p != mds_sessions.end();
5965 ++p) {
5966 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
5967 p->first)) {
5968 if (cct->_conf->client_inject_release_failure) {
5969 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
5970 p->second->release->put();
5971 } else {
5972 p->second->con->send_message(p->second->release);
5973 }
5974 p->second->release = 0;
5975 }
5976 }
5977}
5978
// Periodic housekeeping, re-armed every client_tick_interval: time out
// pre-mount requests, renew caps, flush cap releases, kick delayed
// caps whose hold time expired, and trim the cache.
void Client::tick()
{
  // test hook: one-shot artificial delay, then self-resetting
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  // schedule the next tick
  tick_event = new C_C_Tick(this);
  timer.add_event_after(cct->_conf->client_tick_interval, tick_event);

  utime_t now = ceph_clock_now();

  // Before mount completes, abort the oldest request once it exceeds
  // the mount timeout, and wake everyone waiting on maps/sessions.
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: list is time-ordered, so stop at the first inode
  // whose hold time has not yet expired
  xlist<Inode*>::iterator p = delayed_caps.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_caps.pop_front();
    cap_list.push_back(&in->cap_item);
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6032
6033void Client::renew_caps()
6034{
6035 ldout(cct, 10) << "renew_caps()" << dendl;
6036 last_cap_renew = ceph_clock_now();
6037
6038 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6039 p != mds_sessions.end();
6040 ++p) {
6041 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6042 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6043 renew_caps(p->second);
6044 }
6045}
6046
6047void Client::renew_caps(MetaSession *session)
6048{
6049 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6050 session->last_cap_renew_request = ceph_clock_now();
6051 uint64_t seq = ++session->cap_renew_seq;
6052 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6053}
6054
6055
6056// ===============================================================
6057// high level (POSIXy) interface
6058
6059int Client::_do_lookup(Inode *dir, const string& name, int mask,
6060 InodeRef *target, const UserPerm& perms)
6061{
6062 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6063 MetaRequest *req = new MetaRequest(op);
6064 filepath path;
6065 dir->make_nosnap_relative_path(path);
6066 path.push_dentry(name);
6067 req->set_filepath(path);
6068 req->set_inode(dir);
6069 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6070 mask |= DEBUG_GETATTR_CAPS;
6071 req->head.args.getattr.mask = mask;
6072
6073 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6074
6075 int r = make_request(req, perms, target);
6076 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6077 return r;
6078}
6079
// Look up `dname` in `dir`, preferring the local dentry cache (dentry
// lease or directory CEPH_CAP_FILE_SHARED cap) and falling back to an
// MDS round trip via _do_lookup().  `mask` is the set of caps the
// caller needs on the result.  Fills *target on success; returns 0 or
// a negative errno (including locally-concluded -ENOENT for complete
// directories).
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  // ".." resolves via the first parent dentry (dirs can't be hard-linked);
  // a root/orphan directory is its own parent
  if (dname == "..") {
    if (dir->dn_set.empty())
      *target = dir;
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // the virtual snapshot directory (e.g. ".snap")
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	     << " seq " << dn->lease_seq
	     << dendl;

    // only trust the cached dentry if its inode carries the caps the
    // caller asked for (or it is a negative dentry)
    if (!dn->inode || dn->inode->caps_issued_mask(mask)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?  (shared cap on the directory covers its dentries
      // while the shared_gen matches)
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask)))
	      goto hit_dn;
	// negative dentry in a complete directory: definitive ENOENT
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss (or stale): ask the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6181
// Return (in *pdn) the dentry for `name` in `dir`, creating a null
// dentry if none is cached.  When `expect_null` is set and a cached
// dentry with a valid lease already points at an inode, fail with
// -EEXIST instead (used by exclusive-create paths).
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession *s = mds_sessions[dn->lease_mds];
      if (s->cap_ttl > now &&
	  s->cap_gen == dn->lease_gen) {
	// we know for sure the entry exists
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6213
// Resolve `origpath` component by component starting from root (for
// absolute paths) or cwd, performing per-component permission checks
// when client_permissions is enabled and expanding symlinks (up to
// MAXSYMLINKS; the trailing symlink only when `followsym`).  `mask` is
// extra caps requested on the final component.  On success stores the
// final inode in *end (if non-NULL) and returns 0.
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // need search permission on every directory we traverse
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  // absolute target: restart the walk from the root
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6295
6296
6297// namespace ops
6298
// Create a hard link at `relpath` pointing to the inode at
// `relexisting`.  Performs permission checks (may_hardlink on the
// target, may_create on the new parent) when client_permissions is
// enabled; directories cannot be hard-linked.  Returns 0 or a
// negative errno.
int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "link" << std::endl;
  tout(cct) << relexisting << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath existing(relexisting);

  InodeRef in, dir;
  // resolve the link target (following symlinks)
  int r = path_walk(existing, &in, perm, true);
  if (r < 0)
    return r;
  // cannot create a link named "/"
  if (std::string(relpath) == "/") {
    r = -EEXIST;
    return r;
  }
  // split the new path into parent directory + final name
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();

  r = path_walk(path, &dir, perm, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    // hard links to directories are forbidden
    if (S_ISDIR(in->mode)) {
      r = -EPERM;
      return r;
    }
    r = may_hardlink(in.get(), perm);
    if (r < 0)
      return r;
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  r = _link(in.get(), dir.get(), name.c_str(), perm);
  return r;
}
6341
6342int Client::unlink(const char *relpath, const UserPerm& perm)
6343{
6344 Mutex::Locker lock(client_lock);
6345 tout(cct) << "unlink" << std::endl;
6346 tout(cct) << relpath << std::endl;
6347
181888fb
FG
6348 if (unmounting)
6349 return -ENOTCONN;
6350
7c673cae
FG
6351 if (std::string(relpath) == "/")
6352 return -EISDIR;
6353
6354 filepath path(relpath);
6355 string name = path.last_dentry();
6356 path.pop_dentry();
6357 InodeRef dir;
6358 int r = path_walk(path, &dir, perm);
6359 if (r < 0)
6360 return r;
6361 if (cct->_conf->client_permissions) {
6362 r = may_delete(dir.get(), name.c_str(), perm);
6363 if (r < 0)
6364 return r;
6365 }
6366 return _unlink(dir.get(), name.c_str(), perm);
6367}
6368
6369int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6370{
6371 Mutex::Locker lock(client_lock);
6372 tout(cct) << "rename" << std::endl;
6373 tout(cct) << relfrom << std::endl;
6374 tout(cct) << relto << std::endl;
6375
181888fb
FG
6376 if (unmounting)
6377 return -ENOTCONN;
6378
7c673cae
FG
6379 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6380 return -EBUSY;
6381
6382 filepath from(relfrom);
6383 filepath to(relto);
6384 string fromname = from.last_dentry();
6385 from.pop_dentry();
6386 string toname = to.last_dentry();
6387 to.pop_dentry();
6388
6389 InodeRef fromdir, todir;
6390 int r = path_walk(from, &fromdir, perm);
6391 if (r < 0)
6392 goto out;
6393 r = path_walk(to, &todir, perm);
6394 if (r < 0)
6395 goto out;
6396
6397 if (cct->_conf->client_permissions) {
6398 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6399 if (r < 0)
6400 return r;
6401 r = may_delete(todir.get(), toname.c_str(), perm);
6402 if (r < 0 && r != -ENOENT)
6403 return r;
6404 }
6405 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6406out:
6407 return r;
6408}
6409
6410// dirs
6411
6412int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6413{
6414 Mutex::Locker lock(client_lock);
6415 tout(cct) << "mkdir" << std::endl;
6416 tout(cct) << relpath << std::endl;
6417 tout(cct) << mode << std::endl;
6418 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6419
181888fb
FG
6420 if (unmounting)
6421 return -ENOTCONN;
6422
7c673cae
FG
6423 if (std::string(relpath) == "/")
6424 return -EEXIST;
6425
6426 filepath path(relpath);
6427 string name = path.last_dentry();
6428 path.pop_dentry();
6429 InodeRef dir;
6430 int r = path_walk(path, &dir, perm);
6431 if (r < 0)
6432 return r;
6433 if (cct->_conf->client_permissions) {
6434 r = may_create(dir.get(), perm);
6435 if (r < 0)
6436 return r;
6437 }
6438 return _mkdir(dir.get(), name.c_str(), mode, perm);
6439}
6440
6441int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6442{
6443 Mutex::Locker lock(client_lock);
6444 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
6445 tout(cct) << "mkdirs" << std::endl;
6446 tout(cct) << relpath << std::endl;
6447 tout(cct) << mode << std::endl;
6448
181888fb
FG
6449 if (unmounting)
6450 return -ENOTCONN;
6451
7c673cae
FG
6452 //get through existing parts of path
6453 filepath path(relpath);
6454 unsigned int i;
6455 int r = 0, caps = 0;
6456 InodeRef cur, next;
6457 cur = cwd;
6458 for (i=0; i<path.depth(); ++i) {
6459 if (cct->_conf->client_permissions) {
6460 r = may_lookup(cur.get(), perms);
6461 if (r < 0)
6462 break;
6463 caps = CEPH_CAP_AUTH_SHARED;
6464 }
6465 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6466 if (r < 0)
6467 break;
6468 cur.swap(next);
6469 }
6470 //check that we have work left to do
6471 if (i==path.depth()) return -EEXIST;
6472 if (r!=-ENOENT) return r;
6473 ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
6474 //make new directory at each level
6475 for (; i<path.depth(); ++i) {
6476 if (cct->_conf->client_permissions) {
6477 r = may_create(cur.get(), perms);
6478 if (r < 0)
6479 return r;
6480 }
6481 //make new dir
6482 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
c07f9fc5 6483
7c673cae 6484 //check proper creation/existence
c07f9fc5
FG
6485 if(-EEXIST == r && i < path.depth() - 1) {
6486 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6487 }
6488 if (r < 0)
6489 return r;
7c673cae
FG
6490 //move to new dir and continue
6491 cur.swap(next);
6492 ldout(cct, 20) << "mkdirs: successfully created directory "
6493 << filepath(cur->ino).get_path() << dendl;
6494 }
6495 return 0;
6496}
6497
6498int Client::rmdir(const char *relpath, const UserPerm& perms)
6499{
6500 Mutex::Locker lock(client_lock);
6501 tout(cct) << "rmdir" << std::endl;
6502 tout(cct) << relpath << std::endl;
6503
181888fb
FG
6504 if (unmounting)
6505 return -ENOTCONN;
6506
7c673cae
FG
6507 if (std::string(relpath) == "/")
6508 return -EBUSY;
6509
6510 filepath path(relpath);
6511 string name = path.last_dentry();
6512 path.pop_dentry();
6513 InodeRef dir;
6514 int r = path_walk(path, &dir, perms);
6515 if (r < 0)
6516 return r;
6517 if (cct->_conf->client_permissions) {
6518 int r = may_delete(dir.get(), name.c_str(), perms);
6519 if (r < 0)
6520 return r;
6521 }
6522 return _rmdir(dir.get(), name.c_str(), perms);
6523}
6524
6525int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6526{
6527 Mutex::Locker lock(client_lock);
6528 tout(cct) << "mknod" << std::endl;
6529 tout(cct) << relpath << std::endl;
6530 tout(cct) << mode << std::endl;
6531 tout(cct) << rdev << std::endl;
6532
181888fb
FG
6533 if (unmounting)
6534 return -ENOTCONN;
6535
7c673cae
FG
6536 if (std::string(relpath) == "/")
6537 return -EEXIST;
6538
6539 filepath path(relpath);
6540 string name = path.last_dentry();
6541 path.pop_dentry();
6542 InodeRef dir;
6543 int r = path_walk(path, &dir, perms);
6544 if (r < 0)
6545 return r;
6546 if (cct->_conf->client_permissions) {
6547 int r = may_create(dir.get(), perms);
6548 if (r < 0)
6549 return r;
6550 }
6551 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6552}
6553
6554// symlinks
6555
6556int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6557{
6558 Mutex::Locker lock(client_lock);
6559 tout(cct) << "symlink" << std::endl;
6560 tout(cct) << target << std::endl;
6561 tout(cct) << relpath << std::endl;
6562
181888fb
FG
6563 if (unmounting)
6564 return -ENOTCONN;
6565
7c673cae
FG
6566 if (std::string(relpath) == "/")
6567 return -EEXIST;
6568
6569 filepath path(relpath);
6570 string name = path.last_dentry();
6571 path.pop_dentry();
6572 InodeRef dir;
6573 int r = path_walk(path, &dir, perms);
6574 if (r < 0)
6575 return r;
6576 if (cct->_conf->client_permissions) {
6577 int r = may_create(dir.get(), perms);
6578 if (r < 0)
6579 return r;
6580 }
6581 return _symlink(dir.get(), name.c_str(), target, perms);
6582}
6583
6584int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6585{
6586 Mutex::Locker lock(client_lock);
6587 tout(cct) << "readlink" << std::endl;
6588 tout(cct) << relpath << std::endl;
6589
181888fb
FG
6590 if (unmounting)
6591 return -ENOTCONN;
6592
7c673cae
FG
6593 filepath path(relpath);
6594 InodeRef in;
6595 int r = path_walk(path, &in, perms, false);
6596 if (r < 0)
6597 return r;
6598
6599 return _readlink(in.get(), buf, size);
6600}
6601
6602int Client::_readlink(Inode *in, char *buf, size_t size)
6603{
6604 if (!in->is_symlink())
6605 return -EINVAL;
6606
6607 // copy into buf (at most size bytes)
6608 int r = in->symlink.length();
6609 if (r > (int)size)
6610 r = size;
6611 memcpy(buf, in->symlink.c_str(), r);
6612 return r;
6613}
6614
6615
6616// inode stuff
6617
6618int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6619{
6620 bool yes = in->caps_issued_mask(mask);
6621
6622 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6623 if (yes && !force)
6624 return 0;
6625
6626 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6627 filepath path;
6628 in->make_nosnap_relative_path(path);
6629 req->set_filepath(path);
6630 req->set_inode(in);
6631 req->head.args.getattr.mask = mask;
6632
6633 int res = make_request(req, perms);
6634 ldout(cct, 10) << "_getattr result=" << res << dendl;
6635 return res;
6636}
6637
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  // Apply the attribute changes selected by `mask` (CEPH_SETATTR_* bits),
  // taking values from *stx. Changes covered by exclusive caps are applied
  // locally and the caps marked dirty; anything left over is sent to the
  // MDS as a synchronous SETATTR request.
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // Growing the file must not push us past the quota.
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  // Only if the caller matches whoever last dirtied the caps; otherwise we
  // must go synchronously through the MDS.
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // Dirty whichever exclusive cap we hold so the ctime bump is flushed;
    // if we hold none, fall through to a sync request.
    if (issued & CEPH_CAP_AUTH_EXCL)
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // With exclusive AUTH caps, uid/gid/mode/btime can be changed locally.
    // Truncates and ownership changes must clear setuid/setgid bits.
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // Only the permission bits change; the file-type bits are preserved.
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    // With exclusive FILE caps, mtime/atime can be changed locally.
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // time_warp_seq lets readers detect out-of-order timestamp updates.
      in->time_warp_seq++;
      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask) {
    // Everything was handled locally.
    in->change_attr++;
    return 0;
  }

force_request:
  // Whatever remains in `mask` must be done synchronously at the MDS.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  // For each attribute being set, drop the shared caps that cache it so
  // other clients revalidate.
  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    // The MDS enforces a maximum file size; reject larger requests here.
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6828
6829/* Note that we only care about attrs that setattr cares about */
6830void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6831{
6832 stx->stx_size = st->st_size;
6833 stx->stx_mode = st->st_mode;
6834 stx->stx_uid = st->st_uid;
6835 stx->stx_gid = st->st_gid;
6836 stx->stx_mtime = st->st_mtim;
6837 stx->stx_atime = st->st_atim;
6838}
6839
6840int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6841 const UserPerm& perms, InodeRef *inp)
6842{
6843 int ret = _do_setattr(in, stx, mask, perms, inp);
6844 if (ret < 0)
6845 return ret;
6846 if (mask & CEPH_SETATTR_MODE)
6847 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6848 return ret;
6849}
6850
6851int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6852 const UserPerm& perms)
6853{
6854 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6855 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6856 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6857 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6858 if (cct->_conf->client_permissions) {
6859 int r = may_setattr(in.get(), stx, mask, perms);
6860 if (r < 0)
6861 return r;
6862 }
6863 return __setattrx(in.get(), stx, mask, perms);
6864}
6865
6866int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6867 const UserPerm& perms)
6868{
6869 struct ceph_statx stx;
6870
6871 stat_to_statx(attr, &stx);
6872 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
6873
6874 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6875 mask &= ~CEPH_SETATTR_UID;
6876 }
6877 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6878 mask &= ~CEPH_SETATTR_GID;
6879 }
6880
7c673cae
FG
6881 return _setattrx(in, &stx, mask, perms);
6882}
6883
6884int Client::setattr(const char *relpath, struct stat *attr, int mask,
6885 const UserPerm& perms)
6886{
6887 Mutex::Locker lock(client_lock);
6888 tout(cct) << "setattr" << std::endl;
6889 tout(cct) << relpath << std::endl;
6890 tout(cct) << mask << std::endl;
6891
181888fb
FG
6892 if (unmounting)
6893 return -ENOTCONN;
6894
7c673cae
FG
6895 filepath path(relpath);
6896 InodeRef in;
6897 int r = path_walk(path, &in, perms);
6898 if (r < 0)
6899 return r;
6900 return _setattr(in, attr, mask, perms);
6901}
6902
6903int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6904 const UserPerm& perms, int flags)
6905{
6906 Mutex::Locker lock(client_lock);
6907 tout(cct) << "setattrx" << std::endl;
6908 tout(cct) << relpath << std::endl;
6909 tout(cct) << mask << std::endl;
6910
181888fb
FG
6911 if (unmounting)
6912 return -ENOTCONN;
6913
7c673cae
FG
6914 filepath path(relpath);
6915 InodeRef in;
6916 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6917 if (r < 0)
6918 return r;
6919 return _setattrx(in, stx, mask, perms);
6920}
6921
6922int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6923{
6924 Mutex::Locker lock(client_lock);
6925 tout(cct) << "fsetattr" << std::endl;
6926 tout(cct) << fd << std::endl;
6927 tout(cct) << mask << std::endl;
6928
181888fb
FG
6929 if (unmounting)
6930 return -ENOTCONN;
6931
7c673cae
FG
6932 Fh *f = get_filehandle(fd);
6933 if (!f)
6934 return -EBADF;
6935#if defined(__linux__) && defined(O_PATH)
6936 if (f->flags & O_PATH)
6937 return -EBADF;
6938#endif
6939 return _setattr(f->inode, attr, mask, perms);
6940}
6941
6942int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6943{
6944 Mutex::Locker lock(client_lock);
6945 tout(cct) << "fsetattr" << std::endl;
6946 tout(cct) << fd << std::endl;
6947 tout(cct) << mask << std::endl;
6948
181888fb
FG
6949 if (unmounting)
6950 return -ENOTCONN;
6951
7c673cae
FG
6952 Fh *f = get_filehandle(fd);
6953 if (!f)
6954 return -EBADF;
6955#if defined(__linux__) && defined(O_PATH)
6956 if (f->flags & O_PATH)
6957 return -EBADF;
6958#endif
6959 return _setattrx(f->inode, stx, mask, perms);
6960}
6961
6962int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
6963 frag_info_t *dirstat, int mask)
6964{
6965 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6966 Mutex::Locker lock(client_lock);
6967 tout(cct) << "stat" << std::endl;
6968 tout(cct) << relpath << std::endl;
181888fb
FG
6969
6970 if (unmounting)
6971 return -ENOTCONN;
6972
7c673cae
FG
6973 filepath path(relpath);
6974 InodeRef in;
6975 int r = path_walk(path, &in, perms, true, mask);
6976 if (r < 0)
6977 return r;
6978 r = _getattr(in, mask, perms);
6979 if (r < 0) {
6980 ldout(cct, 3) << "stat exit on error!" << dendl;
6981 return r;
6982 }
6983 fill_stat(in, stbuf, dirstat);
6984 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
6985 return r;
6986}
6987
6988unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
6989{
6990 unsigned mask = 0;
6991
6992 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
6993 if (flags & AT_NO_ATTR_SYNC)
6994 goto out;
6995
6996 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
6997 mask |= CEPH_CAP_PIN;
6998 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6999 mask |= CEPH_CAP_AUTH_SHARED;
7000 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7001 mask |= CEPH_CAP_LINK_SHARED;
7002 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7003 mask |= CEPH_CAP_FILE_SHARED;
7004 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7005 mask |= CEPH_CAP_XATTR_SHARED;
7006out:
7007 return mask;
7008}
7009
7010int Client::statx(const char *relpath, struct ceph_statx *stx,
7011 const UserPerm& perms,
7012 unsigned int want, unsigned int flags)
7013{
7014 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
7015 Mutex::Locker lock(client_lock);
7016 tout(cct) << "statx" << std::endl;
7017 tout(cct) << relpath << std::endl;
181888fb
FG
7018
7019 if (unmounting)
7020 return -ENOTCONN;
7021
7c673cae
FG
7022 filepath path(relpath);
7023 InodeRef in;
7024
7025 unsigned mask = statx_to_mask(flags, want);
7026
7027 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7028 if (r < 0)
7029 return r;
7030
7031 r = _getattr(in, mask, perms);
7032 if (r < 0) {
7033 ldout(cct, 3) << "statx exit on error!" << dendl;
7034 return r;
7035 }
7036
7037 fill_statx(in, mask, stx);
7038 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7039 return r;
7040}
7041
7042int Client::lstat(const char *relpath, struct stat *stbuf,
7043 const UserPerm& perms, frag_info_t *dirstat, int mask)
7044{
7045 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7046 Mutex::Locker lock(client_lock);
7047 tout(cct) << "lstat" << std::endl;
7048 tout(cct) << relpath << std::endl;
181888fb
FG
7049
7050 if (unmounting)
7051 return -ENOTCONN;
7052
7c673cae
FG
7053 filepath path(relpath);
7054 InodeRef in;
7055 // don't follow symlinks
7056 int r = path_walk(path, &in, perms, false, mask);
7057 if (r < 0)
7058 return r;
7059 r = _getattr(in, mask, perms);
7060 if (r < 0) {
7061 ldout(cct, 3) << "lstat exit on error!" << dendl;
7062 return r;
7063 }
7064 fill_stat(in, stbuf, dirstat);
7065 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7066 return r;
7067}
7068
7069int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7070{
7071 ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
7072 << " mode 0" << oct << in->mode << dec
7073 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7074 memset(st, 0, sizeof(struct stat));
7075 if (use_faked_inos())
7076 st->st_ino = in->faked_ino;
7077 else
7078 st->st_ino = in->ino;
7079 st->st_dev = in->snapid;
7080 st->st_mode = in->mode;
7081 st->st_rdev = in->rdev;
7082 st->st_nlink = in->nlink;
7083 st->st_uid = in->uid;
7084 st->st_gid = in->gid;
7085 if (in->ctime > in->mtime) {
7086 stat_set_ctime_sec(st, in->ctime.sec());
7087 stat_set_ctime_nsec(st, in->ctime.nsec());
7088 } else {
7089 stat_set_ctime_sec(st, in->mtime.sec());
7090 stat_set_ctime_nsec(st, in->mtime.nsec());
7091 }
7092 stat_set_atime_sec(st, in->atime.sec());
7093 stat_set_atime_nsec(st, in->atime.nsec());
7094 stat_set_mtime_sec(st, in->mtime.sec());
7095 stat_set_mtime_nsec(st, in->mtime.nsec());
7096 if (in->is_dir()) {
7097 if (cct->_conf->client_dirsize_rbytes)
7098 st->st_size = in->rstat.rbytes;
7099 else
7100 st->st_size = in->dirstat.size();
7101 st->st_blocks = 1;
7102 } else {
7103 st->st_size = in->size;
7104 st->st_blocks = (in->size + 511) >> 9;
7105 }
7106 st->st_blksize = MAX(in->layout.stripe_unit, 4096);
7107
7108 if (dirstat)
7109 *dirstat = in->dirstat;
7110 if (rstat)
7111 *rstat = in->rstat;
7112
7113 return in->caps_issued();
7114}
7115
7116void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7117{
7118 ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
7119 << " mode 0" << oct << in->mode << dec
7120 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7121 memset(stx, 0, sizeof(struct ceph_statx));
7122
7123 /*
7124 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7125 * so that all bits are set.
7126 */
7127 if (!mask)
7128 mask = ~0;
7129
7130 /* These are always considered to be available */
7131 stx->stx_dev = in->snapid;
7132 stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);
7133
7134 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7135 stx->stx_mode = S_IFMT & in->mode;
7136 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7137 stx->stx_rdev = in->rdev;
7138 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7139
7140 if (mask & CEPH_CAP_AUTH_SHARED) {
7141 stx->stx_uid = in->uid;
7142 stx->stx_gid = in->gid;
7143 stx->stx_mode = in->mode;
7144 in->btime.to_timespec(&stx->stx_btime);
7145 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7146 }
7147
7148 if (mask & CEPH_CAP_LINK_SHARED) {
7149 stx->stx_nlink = in->nlink;
7150 stx->stx_mask |= CEPH_STATX_NLINK;
7151 }
7152
7153 if (mask & CEPH_CAP_FILE_SHARED) {
7154
7155 in->atime.to_timespec(&stx->stx_atime);
7156 in->mtime.to_timespec(&stx->stx_mtime);
7157
7158 if (in->is_dir()) {
7159 if (cct->_conf->client_dirsize_rbytes)
7160 stx->stx_size = in->rstat.rbytes;
7161 else
7162 stx->stx_size = in->dirstat.size();
7163 stx->stx_blocks = 1;
7164 } else {
7165 stx->stx_size = in->size;
7166 stx->stx_blocks = (in->size + 511) >> 9;
7167 }
7168 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7169 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7170 }
7171
7172 /* Change time and change_attr both require all shared caps to view */
7173 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7174 stx->stx_version = in->change_attr;
7175 if (in->ctime > in->mtime)
7176 in->ctime.to_timespec(&stx->stx_ctime);
7177 else
7178 in->mtime.to_timespec(&stx->stx_ctime);
7179 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7180 }
7181
7182}
7183
void Client::touch_dn(Dentry *dn)
{
  // Mark the dentry as recently used so the LRU trimmer keeps it longer.
  lru.lru_touch(dn);
}
7188
7189int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7190{
7191 Mutex::Locker lock(client_lock);
7192 tout(cct) << "chmod" << std::endl;
7193 tout(cct) << relpath << std::endl;
7194 tout(cct) << mode << std::endl;
181888fb
FG
7195
7196 if (unmounting)
7197 return -ENOTCONN;
7198
7c673cae
FG
7199 filepath path(relpath);
7200 InodeRef in;
7201 int r = path_walk(path, &in, perms);
7202 if (r < 0)
7203 return r;
7204 struct stat attr;
7205 attr.st_mode = mode;
7206 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7207}
7208
7209int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7210{
7211 Mutex::Locker lock(client_lock);
7212 tout(cct) << "fchmod" << std::endl;
7213 tout(cct) << fd << std::endl;
7214 tout(cct) << mode << std::endl;
181888fb
FG
7215
7216 if (unmounting)
7217 return -ENOTCONN;
7218
7c673cae
FG
7219 Fh *f = get_filehandle(fd);
7220 if (!f)
7221 return -EBADF;
7222#if defined(__linux__) && defined(O_PATH)
7223 if (f->flags & O_PATH)
7224 return -EBADF;
7225#endif
7226 struct stat attr;
7227 attr.st_mode = mode;
7228 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7229}
7230
7231int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7232{
7233 Mutex::Locker lock(client_lock);
7234 tout(cct) << "lchmod" << std::endl;
7235 tout(cct) << relpath << std::endl;
7236 tout(cct) << mode << std::endl;
181888fb
FG
7237
7238 if (unmounting)
7239 return -ENOTCONN;
7240
7c673cae
FG
7241 filepath path(relpath);
7242 InodeRef in;
7243 // don't follow symlinks
7244 int r = path_walk(path, &in, perms, false);
7245 if (r < 0)
7246 return r;
7247 struct stat attr;
7248 attr.st_mode = mode;
7249 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7250}
7251
7252int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7253 const UserPerm& perms)
7254{
7255 Mutex::Locker lock(client_lock);
7256 tout(cct) << "chown" << std::endl;
7257 tout(cct) << relpath << std::endl;
7258 tout(cct) << new_uid << std::endl;
7259 tout(cct) << new_gid << std::endl;
181888fb
FG
7260
7261 if (unmounting)
7262 return -ENOTCONN;
7263
7c673cae
FG
7264 filepath path(relpath);
7265 InodeRef in;
7266 int r = path_walk(path, &in, perms);
7267 if (r < 0)
7268 return r;
7269 struct stat attr;
7270 attr.st_uid = new_uid;
7271 attr.st_gid = new_gid;
181888fb 7272 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7273}
7274
7275int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7276{
7277 Mutex::Locker lock(client_lock);
7278 tout(cct) << "fchown" << std::endl;
7279 tout(cct) << fd << std::endl;
7280 tout(cct) << new_uid << std::endl;
7281 tout(cct) << new_gid << std::endl;
181888fb
FG
7282
7283 if (unmounting)
7284 return -ENOTCONN;
7285
7c673cae
FG
7286 Fh *f = get_filehandle(fd);
7287 if (!f)
7288 return -EBADF;
7289#if defined(__linux__) && defined(O_PATH)
7290 if (f->flags & O_PATH)
7291 return -EBADF;
7292#endif
7293 struct stat attr;
7294 attr.st_uid = new_uid;
7295 attr.st_gid = new_gid;
7296 int mask = 0;
7297 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7298 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7299 return _setattr(f->inode, &attr, mask, perms);
7300}
7301
7302int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7303 const UserPerm& perms)
7304{
7305 Mutex::Locker lock(client_lock);
7306 tout(cct) << "lchown" << std::endl;
7307 tout(cct) << relpath << std::endl;
7308 tout(cct) << new_uid << std::endl;
7309 tout(cct) << new_gid << std::endl;
181888fb
FG
7310
7311 if (unmounting)
7312 return -ENOTCONN;
7313
7c673cae
FG
7314 filepath path(relpath);
7315 InodeRef in;
7316 // don't follow symlinks
7317 int r = path_walk(path, &in, perms, false);
7318 if (r < 0)
7319 return r;
7320 struct stat attr;
7321 attr.st_uid = new_uid;
7322 attr.st_gid = new_gid;
7323 int mask = 0;
7324 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7325 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7326 return _setattr(in, &attr, mask, perms);
7327}
7328
7329int Client::utime(const char *relpath, struct utimbuf *buf,
7330 const UserPerm& perms)
7331{
7332 Mutex::Locker lock(client_lock);
7333 tout(cct) << "utime" << std::endl;
7334 tout(cct) << relpath << std::endl;
7335 tout(cct) << buf->modtime << std::endl;
7336 tout(cct) << buf->actime << std::endl;
181888fb
FG
7337
7338 if (unmounting)
7339 return -ENOTCONN;
7340
7c673cae
FG
7341 filepath path(relpath);
7342 InodeRef in;
7343 int r = path_walk(path, &in, perms);
7344 if (r < 0)
7345 return r;
7346 struct stat attr;
7347 stat_set_mtime_sec(&attr, buf->modtime);
7348 stat_set_mtime_nsec(&attr, 0);
7349 stat_set_atime_sec(&attr, buf->actime);
7350 stat_set_atime_nsec(&attr, 0);
7351 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7352}
7353
// Like utime(), but operates on the symlink itself rather than its
// target (path_walk is told not to follow the final symlink).
int Client::lutime(const char *relpath, struct utimbuf *buf,
		   const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "lutime" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << buf->modtime << std::endl;
  tout(cct) << buf->actime << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  // Only the masked mtime/atime fields are consumed by _setattr.
  struct stat attr;
  stat_set_mtime_sec(&attr, buf->modtime);
  stat_set_mtime_nsec(&attr, 0);
  stat_set_atime_sec(&attr, buf->actime);
  stat_set_atime_nsec(&attr, 0);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7379
// Apply or remove an advisory lock (flock(2)-style 'operation') on the
// open file 'fd'.  'owner' identifies the lock owner so locks acquired
// through different handles can be told apart.
int Client::flock(int fd, int operation, uint64_t owner)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "flock" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _flock(f, operation, owner);
}
7397
// Open the directory at 'relpath' and hand back a dir_result_t handle in
// *dirpp.  Returns 0 on success or a negative errno (-ENOTDIR if the
// path resolves to a non-directory).
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "opendir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    // opening a directory for reading requires read permission on it
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
  if (r != -ENOTDIR)
    tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}
7423
7424int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7425{
7426 if (!in->is_dir())
7427 return -ENOTDIR;
7428 *dirpp = new dir_result_t(in, perms);
7429 opened_dirs.insert(*dirpp);
7430 ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7431 return 0;
7432}
7433
7434
// Close a directory handle previously returned by opendir().  Always
// returns 0.
int Client::closedir(dir_result_t *dir)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "closedir" << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7445
// Tear down a dir_result_t: drop its inode reference and buffered
// entries, unregister it, and free it.  Callers hold client_lock.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7457
// Reset the directory stream to the beginning, discarding any buffered
// entries (rewinddir(3) semantics).
void Client::rewinddir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7470
// Return the current position in the directory stream; the value is an
// opaque token suitable for a later seekdir().
// NOTE(review): unlike the other dir operations this does not take
// client_lock before reading d->offset -- confirm that is intentional.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7477
// Reposition the directory stream to 'offset', a value previously
// obtained from telldir().  No-op if already at that position.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  // Any seek breaks the "scanned the whole dir in order" accounting that
  // readdir_r_cb uses before marking the dentry cache complete/ordered.
  if (offset > dirp->offset)
    dirp->release_count = 0;   // forward seek: can no longer prove completeness
  else
    dirp->ordered_count = 0;   // backward seek: disable filling readdir cache

  if (dirp->hash_order()) {
    // In hash order the buffer can only serve increasing offsets;
    // seeking backwards forces a fresh fetch from the MDS.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Keep the buffer only if the target lies within the currently
    // buffered fragment, at or beyond the low offset we still hold.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7511
7512
7513//struct dirent {
7514// ino_t d_ino; /* inode number */
7515// off_t d_off; /* offset to the next dirent */
7516// unsigned short d_reclen; /* length of this record */
7517// unsigned char d_type; /* type of file */
7518// char d_name[256]; /* filename */
7519//};
// Populate a struct dirent from ceph metadata.  'type' is an S_IF* mode
// value (converted to d_type via IFTODT); 'next_off' is the offset of
// the *next* entry, stored in d_off where the platform has it.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // Truncate to the dirent name limit and always NUL-terminate.
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;  // NOTE(review): placeholder, not a real record length
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7535
// Advance the readdir position past the current directory fragment, or
// mark the stream at end if we were already on the rightmost fragment.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name; only ever move the offset forward
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    // the fragtree may have changed under us; re-resolve the fragment
    _readdir_rechoose_frag(dirp);
  }
}
7561
// Re-resolve the fragment implied by dirp's offset against the current
// dirfragtree; if the tree changed, restart from the beginning of the
// actual fragment.  Hash-order streams don't track fragments this way.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7578
// Discard the buffered dentries for this directory stream (they will be
// re-fetched from the MDS on the next read).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
  dirp->buffer.clear();
}
7584
// Fetch one directory fragment's worth of entries from the MDS into
// dirp->buffer.  On any error other than -EAGAIN the stream is marked at
// end.  Returns 0 on success or a negative errno.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  assert(dirp);
  assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  // a snapdir listing enumerates snapshots rather than dentries
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // continue a partially-read fragment after the last returned name
    req->path2.set_path(dirp->last_name.c_str());
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // fragment mapping was stale; re-resolve and retry.
    // NOTE(review): recursion depth is bounded only by how often the MDS
    // keeps returning EAGAIN -- presumed rare; confirm.
    ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7639
// Comparator for binary-searching the readdir_cache vector by readdir
// offset, using the fragment-aware fpos comparison.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7645
// Serve readdir entries out of the locally cached, complete-and-ordered
// Dir.  Invokes 'cb' per entry with client_lock dropped.  Returns 0 at
// end of directory, a positive callback stop-value, -EAGAIN if the cache
// stops being complete/ordered mid-scan, or a negative errno.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume from the first cached dentry at or after our current offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // the callback below drops client_lock, so the cache can be
    // invalidated between iterations; fall back to MDS readdir if so
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // dentry predates the current shared-cap generation; treat as stale
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop the lock around the user callback; it may re-enter the client
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7728
// Core readdir driver: iterate directory 'd', invoking 'cb' once per
// entry (including synthetic "." and "..") until the callback returns
// non-zero or the directory is exhausted.  Serves from the local dentry
// cache when it is complete and ordered, otherwise fetches fragments
// from the MDS.  Returns 0 at end-of-directory, a positive callback
// stop-value, or a negative errno.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0 is the synthetic "." entry
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the lock around the user callback; it may re-enter the client
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1 is the synthetic ".." entry
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    // an unlinked/parentless dir reports itself as ".."
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->inode;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // -EAGAIN means the cache became unusable mid-scan; fall through to
    // fetching fragments from the MDS
    if (err != -EAGAIN)
      return err;
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // deliver all buffered entries at or after our current offset
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // the MDS returned a partial fragment; fetch its next chunk
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // full in-order scan finished: if nothing changed under us, the
    // local dentry cache now covers the entire directory
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
7921
7922
7923int Client::readdir_r(dir_result_t *d, struct dirent *de)
7924{
7925 return readdirplus_r(d, de, 0, 0, 0, NULL);
7926}
7927
/*
 * readdirplus_r
 *
 * returns
 *  1 if we got a dirent
 *  0 for end of directory
 * <0 on error
 */

// Per-call state for the "return exactly one entry" readdir callback.
struct single_readdir {
  struct dirent *de;       // caller's dirent to fill
  struct ceph_statx *stx;  // optional statx to fill (may be NULL)
  Inode *inode;            // inode passed back by the readdir machinery
  bool full;               // true once an entry has been captured
};
7943
7944static int _readdir_single_dirent_cb(void *p, struct dirent *de,
7945 struct ceph_statx *stx, off_t off,
7946 Inode *in)
7947{
7948 single_readdir *c = static_cast<single_readdir *>(p);
7949
7950 if (c->full)
7951 return -1; // already filled this dirent
7952
7953 *c->de = *de;
7954 if (c->stx)
7955 *c->stx = *stx;
7956 c->inode = in;
7957 c->full = true;
7958 return 1;
7959}
7960
// Classic readdir(3) interface: return a pointer to the next entry, or
// NULL at end-of-directory / on error (errno set on error).
// NOTE(review): the static dirent makes this non-reentrant and not
// thread-safe, mirroring historical readdir() semantics.
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    errno = -ret; // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;
}
7983
7984int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
7985 struct ceph_statx *stx, unsigned want,
7986 unsigned flags, Inode **out)
7987{
7988 single_readdir sr;
7989 sr.de = de;
7990 sr.stx = stx;
7991 sr.inode = NULL;
7992 sr.full = false;
7993
7994 // our callback fills the dirent and sets sr.full=true on first
7995 // call, and returns -1 the second time around.
7996 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
7997 if (r < -1)
7998 return r;
7999 if (out)
8000 *out = sr.inode;
8001 if (sr.full)
8002 return 1;
8003 return 0;
8004}
8005
8006
/* getdents */
// Accumulator for _getdents(): packs entries (whole dirents or bare
// names) into the caller-supplied buffer.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: copy whole dirents; false: NUL-terminated names only
};
8014
8015static int _readdir_getdent_cb(void *p, struct dirent *de,
8016 struct ceph_statx *stx, off_t off, Inode *in)
8017{
8018 struct getdents_result *c = static_cast<getdents_result *>(p);
8019
8020 int dlen;
8021 if (c->fullent)
8022 dlen = sizeof(*de);
8023 else
8024 dlen = strlen(de->d_name) + 1;
8025
8026 if (c->pos + dlen > c->buflen)
8027 return -1; // doesn't fit
8028
8029 if (c->fullent) {
8030 memcpy(c->buf + c->pos, de, sizeof(*de));
8031 } else {
8032 memcpy(c->buf + c->pos, de->d_name, dlen);
8033 }
8034 c->pos += dlen;
8035 return 0;
8036}
8037
8038int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8039{
8040 getdents_result gr;
8041 gr.buf = buf;
8042 gr.buflen = buflen;
8043 gr.fullent = fullent;
8044 gr.pos = 0;
8045
8046 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8047
8048 if (r < 0) { // some error
8049 if (r == -1) { // buffer ran out of space
8050 if (gr.pos) { // but we got some entries already!
8051 return gr.pos;
8052 } // or we need a larger buffer
8053 return -ERANGE;
8054 } else { // actual error, return it
8055 return r;
8056 }
8057 }
8058 return gr.pos;
8059}
8060
8061
/* getdir */
// Accumulator for getdir(): collects entry names and counts them.
struct getdir_result {
  list<string> *contents;  // names appended in readdir order
  int num;                 // number of entries collected
};
8067
8068static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8069{
8070 getdir_result *r = static_cast<getdir_result *>(p);
8071
8072 r->contents->push_back(de->d_name);
8073 r->num++;
8074 return 0;
8075}
8076
// Convenience wrapper: read the entire directory at 'relpath' into
// 'contents'.  Returns the number of entries or a negative errno.
int Client::getdir(const char *relpath, list<string>& contents,
		   const UserPerm& perms)
{
  ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
  {
    // scoped lock: only the trace output needs it here; opendir/
    // readdir_r_cb/closedir below take client_lock themselves
    Mutex::Locker lock(client_lock);
    tout(cct) << "getdir" << std::endl;
    tout(cct) << relpath << std::endl;
  }

  dir_result_t *d;
  int r = opendir(relpath, &d, perms);
  if (r < 0)
    return r;

  getdir_result gr;
  gr.contents = &contents;
  gr.num = 0;
  r = readdir_r_cb(d, _getdir_cb, (void *)&gr);

  closedir(d);

  if (r < 0)
    return r;
  return gr.num;
}
8103
8104
8105/****** file i/o **********/
// Open (and optionally create) the file at 'relpath'.  The striping
// parameters are only meaningful when a new file is created.  Returns a
// non-negative file descriptor or a negative errno.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // exclusive create of an existing file fails outright
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // target missing and creation requested: resolve the parent dir,
    // check create permission, and create the file there
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may already have produced an Fh; otherwise open the inode now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8190
// Convenience overload of open() without striping parameters.
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8196
8197int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8198 const UserPerm& perms)
8199{
8200 Mutex::Locker lock(client_lock);
8201 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8202
181888fb
FG
8203 if (unmounting)
8204 return -ENOTCONN;
8205
7c673cae
FG
8206 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8207 filepath path(ino);
8208 req->set_filepath(path);
8209
8210 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8211 char f[30];
8212 sprintf(f, "%u", h);
8213 filepath path2(dirino);
8214 path2.push_dentry(string(f));
8215 req->set_filepath2(path2);
8216
8217 int r = make_request(req, perms, NULL, NULL,
8218 rand() % mdsmap->get_num_in_mds());
8219 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8220 return r;
8221}
8222
8223
/**
 * Load inode into local cache.
 *
 * If the inode pointer is non-NULL, also take a reference on
 * the resulting Inode object in one operation, so that the caller
 * can safely assume the inode will still be there after return.
 */
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // a successful LOOKUPINO reply inserts the inode into inode_map, so
    // the find below must succeed
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
  return r;
}
8254
8255
8256
8257/**
8258 * Find the parent inode of `ino` and insert it into
8259 * our cache. Conditionally also set `parent` to a referenced
8260 * Inode* if caller provides non-NULL value.
8261 */
8262int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8263{
8264 Mutex::Locker lock(client_lock);
8265 ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
8266
181888fb
FG
8267 if (unmounting)
8268 return -ENOTCONN;
8269
7c673cae
FG
8270 if (!ino->dn_set.empty()) {
8271 // if we exposed the parent here, we'd need to check permissions,
8272 // but right now we just rely on the MDS doing so in make_request
8273 ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
8274 return 0;
8275 }
8276
8277 if (ino->is_root()) {
8278 *parent = NULL;
8279 ldout(cct, 3) << "ino is root, no parent" << dendl;
8280 return -EINVAL;
8281 }
8282
8283 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8284 filepath path(ino->ino);
8285 req->set_filepath(path);
8286
8287 InodeRef target;
8288 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8289 // Give caller a reference to the parent ino if they provided a pointer.
8290 if (parent != NULL) {
8291 if (r == 0) {
8292 *parent = target.get();
8293 _ll_get(*parent);
8294 ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
8295 } else {
8296 *parent = NULL;
8297 }
8298 }
8299 ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
8300 return r;
8301}
8302
8303
/**
 * Populate the parent dentry for `ino`, provided it is
 * a child of `parent`.
 */
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  assert(parent->is_dir());

  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // any in-MDS can serve the lookup; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8327
8328
// Allocate and initialize an Fh for an opened inode: record open mode,
// flags and acting perms, account snap cap references, and configure the
// per-handle readahead window from config and file layout.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  assert(in);
  Fh *f = new Fh(in);
  f->mode = cmode;
  f->flags = flags;

  // inode
  f->actor_perms = perms;

  ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // snapshot inodes track open handles via snap_cap_refs instead of
    // regular cap accounting
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
		  << ccap_string(in->caps_issued()) << dendl;
  }

  // readahead limits: smallest of the byte cap and the period cap, when set
  const md_config_t *conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead to the file layout (stripe period and stripe unit)
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
8365
// Release an open file handle: flush dirty data / adjust caps when the
// last open reference in this mode goes away, drop file locks, surface
// any async write error, and drop the handle reference.  Returns the
// captured async error (0 if none).
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last opener in this mode: flush buffered data and let the MDS
      // reclaim caps we no longer want
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes only track a handle refcount
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8398
8399void Client::_put_fh(Fh *f)
8400{
8401 int left = f->put();
8402 if (!left) {
8403 delete f;
8404 }
8405}
8406
// Internal open of an already-resolved inode.  Either satisfies the open
// from caps we already hold or issues an MDS OPEN request, then creates
// the Fh in *fhp.  Returns 0/positive on success, negative errno on
// failure (-EROFS for write-opens of snapshot inodes).
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 &&
      in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs; skip the MDS round
    // trip (O_TRUNC still requires a request)
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    // O_CREAT was handled by the caller; never forward it to the MDS
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the speculative open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8457
// Re-acquire file caps for an inode whose caps have lapsed: if current
// caps suffice, just nudge cap state; otherwise replay an MDS OPEN with
// flags derived from what the open file handles still want.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate the wanted cap bits back into open flags for the request
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8495
8496int Client::close(int fd)
8497{
8498 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8499 Mutex::Locker lock(client_lock);
8500 tout(cct) << "close" << std::endl;
8501 tout(cct) << fd << std::endl;
8502
181888fb
FG
8503 if (unmounting)
8504 return -ENOTCONN;
8505
7c673cae
FG
8506 Fh *fh = get_filehandle(fd);
8507 if (!fh)
8508 return -EBADF;
8509 int err = _release_fh(fh);
8510 fd_map.erase(fd);
8511 put_fd(fd);
8512 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8513 return err;
8514}
8515
8516
8517// ------------
8518// read, write
8519
8520loff_t Client::lseek(int fd, loff_t offset, int whence)
8521{
8522 Mutex::Locker lock(client_lock);
8523 tout(cct) << "lseek" << std::endl;
8524 tout(cct) << fd << std::endl;
8525 tout(cct) << offset << std::endl;
8526 tout(cct) << whence << std::endl;
8527
181888fb
FG
8528 if (unmounting)
8529 return -ENOTCONN;
8530
7c673cae
FG
8531 Fh *f = get_filehandle(fd);
8532 if (!f)
8533 return -EBADF;
8534#if defined(__linux__) && defined(O_PATH)
8535 if (f->flags & O_PATH)
8536 return -EBADF;
8537#endif
8538 return _lseek(f, offset, whence);
8539}
8540
8541loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8542{
8543 Inode *in = f->inode.get();
8544 int r;
8545
8546 switch (whence) {
8547 case SEEK_SET:
8548 f->pos = offset;
8549 break;
8550
8551 case SEEK_CUR:
8552 f->pos += offset;
8553 break;
8554
8555 case SEEK_END:
8556 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8557 if (r < 0)
8558 return r;
8559 f->pos = in->size + offset;
8560 break;
8561
8562 default:
8563 ceph_abort();
8564 }
8565
8566 ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8567 return f->pos;
8568}
8569
8570
// Take the per-handle file-position lock.  Callers that use the implicit
// file offset (read/write with offset < 0) serialize through this so f->pos
// updates are atomic with respect to each other.  Waiters queue FIFO: each
// blocked caller parks a stack-allocated Cond on f->pos_waiters and may only
// proceed once the lock is free AND its Cond is at the head of the queue.
// Called with client_lock held; Cond::Wait drops and retakes it.
// NOTE(review): nothing in this function signals the queued Conds; forward
// progress depends on unlock_fh_pos() waking waiters — confirm.
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until the lock is free and we are the oldest waiter
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8588
8589void Client::unlock_fh_pos(Fh *f)
8590{
8591 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8592 f->pos_locked = false;
8593}
8594
// Migrate an inode's MDS-inlined file data out to its first RADOS object
// so normal object I/O can proceed.  Issues two OSD ops on object
// <ino>.00000000: (1) an unguarded create, (2) a write of the inline data
// guarded by a cmpxattr on "inline_version" (GT, u64) so an older inline
// version never clobbers data already uninlined by someone else.
// onfinish is completed with the result of the second mutation (or with 0
// immediately if there is no inline data).  Always returns 0.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    // nothing inlined; report success right away
    onfinish->complete(0);
    return 0;
  }

  // object name of the first stripe object: "<hex ino>.00000000"
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // make sure the object exists (create-if-absent, no exclusive flag)
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  // write the data only if our inline_version is newer than what is
  // recorded on the object (guards against racing uninliners)
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8639
8640//
8641
8642// blocking osd interface
8643
8644int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8645{
8646 Mutex::Locker lock(client_lock);
8647 tout(cct) << "read" << std::endl;
8648 tout(cct) << fd << std::endl;
8649 tout(cct) << size << std::endl;
8650 tout(cct) << offset << std::endl;
8651
181888fb
FG
8652 if (unmounting)
8653 return -ENOTCONN;
8654
7c673cae
FG
8655 Fh *f = get_filehandle(fd);
8656 if (!f)
8657 return -EBADF;
8658#if defined(__linux__) && defined(O_PATH)
8659 if (f->flags & O_PATH)
8660 return -EBADF;
8661#endif
8662 bufferlist bl;
8663 int r = _read(f, offset, size, &bl);
8664 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8665 if (r >= 0) {
8666 bl.copy(0, bl.length(), buf);
8667 r = bl.length();
8668 }
8669 return r;
8670}
8671
8672int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8673{
8674 if (iovcnt < 0)
8675 return -EINVAL;
8676 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8677}
8678
// Core read path.  offset < 0 means "use and advance the handle's implicit
// file position" (guarded by lock_fh_pos/unlock_fh_pos).  Handles three
// data sources: MDS-inlined data, the object cacher (buffered reads +
// readahead), and direct sync reads from the OSDs.  Holds a FILE_RD cap
// ref for the duration (tracked via `have`); may drop and retake
// client_lock while waiting for uninline I/O.  Returns bytes placed in
// *bl, or a negative errno.
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // offset < 0: consume the shared file position; remember to both
  // advance it and release the pos lock on every exit path
  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means "unknown"; fetch it before deciding how to read
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
        unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  // O_DIRECT bypasses the object cacher even if we hold the CACHE cap
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;

  // completion plumbing for a possible inline-data migration
  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // can't serve inline data without the cache cap: push it out to
      // RADOS first, then fall through to a normal read
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve the read straight from the inline blob, zero-filling any
      // gap between the blob's end and the (clamped) requested range
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // buffered path; O_RSYNC forces dirty data in range to disk first
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read: our cached size may be stale.  Drop the cap ref,
      // re-fetch the size, and retry if there is more data to read.
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait (outside client_lock) for the uninline op to complete, then
    // retire the local inline copy on success (or a racing uninliner's
    // success, signalled as -ECANCELED by the version guard)
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
8822
// Completion context for a background readahead issued by _read_async.
// The ctor pins the Fh and bumps the readahead pending count; the dtor
// undoes both, and finish() drops the cap refs taken when the readahead
// was started.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();                      // keep the handle alive until completion
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}

void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // release the RD|CACHE cap refs taken when the readahead was submitted
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8838
// Buffered read through the object cacher, plus opportunistic readahead.
// The requested range is clamped to the known file size.  If the data is
// not fully cached, we block (dropping client_lock) until the cacher
// completes; a FILE_CACHE cap ref pins the cache while we wait.  Any
// follow-on readahead holds RD|CACHE cap refs that are released by
// C_Readahead::finish.  Returns bytes read or a negative errno.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    // miss: wait for the cacher to fill, outside client_lock
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    // let the readahead state machine decide what (if anything) to prefetch
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
8903
// Synchronous (uncached) read straight from the OSDs via the Filer,
// looping until the request is satisfied.  Holes and short object reads
// inside the known file size are zero-filled; a short read at the end of
// the range sets *checkeof so the caller can re-verify the file size and
// retry.  Drops and retakes client_lock around each OSD round trip.
// Returns bytes appended to *bl, or a negative errno.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    // block for this stripe's reply outside client_lock
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // hit (possibly stale) EOF: let the caller re-check the size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
8970
8971
8972/*
8973 * we keep count of uncommitted sync writes on the inode, so that
8974 * fsync can DDRT.
8975 */
8976void Client::_sync_write_commit(Inode *in)
8977{
8978 assert(unsafe_sync_write > 0);
8979 unsafe_sync_write--;
8980
8981 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
8982
8983 ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
8984 if (unsafe_sync_write == 0 && unmounting) {
8985 ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
8986 mount_cond.Signal();
8987 }
8988}
8989
8990int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
8991{
8992 Mutex::Locker lock(client_lock);
8993 tout(cct) << "write" << std::endl;
8994 tout(cct) << fd << std::endl;
8995 tout(cct) << size << std::endl;
8996 tout(cct) << offset << std::endl;
8997
181888fb
FG
8998 if (unmounting)
8999 return -ENOTCONN;
9000
7c673cae
FG
9001 Fh *fh = get_filehandle(fd);
9002 if (!fh)
9003 return -EBADF;
9004#if defined(__linux__) && defined(O_PATH)
9005 if (fh->flags & O_PATH)
9006 return -EBADF;
9007#endif
9008 int r = _write(fh, offset, size, buf, NULL, 0);
9009 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9010 return r;
9011}
9012
9013int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9014{
9015 if (iovcnt < 0)
9016 return -EINVAL;
9017 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9018}
9019
// Shared implementation behind preadv/pwritev.  For writes, _write gathers
// directly from the iovec.  For reads, we read the total length into one
// bufferlist and then scatter it across the iovec entries, handling a
// short read by partially filling the last touched entry.  Returns bytes
// transferred or a negative errno.
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be used for data I/O
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  // total bytes requested across all iovec entries
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9069
// Core write path.  Data comes either from `buf` or, when buf is NULL,
// gathered from `iov`/`iovcnt`.  offset < 0 means "use and advance the
// handle's implicit position" (honoring O_APPEND).  Handles inline data,
// buffered writes through the object cacher, and synchronous writes via
// the Filer.  Takes FILE_WR|AUTH_SHARED caps up front; AUTH_SHARED is
// dropped as soon as the setuid/setgid check is done, FILE_WR at the end.
// May drop/retake client_lock while waiting for sync or uninline I/O.
// Returns bytes written or a negative errno.
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	           const struct iovec *iov, int iovcnt)
{
  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    f->pos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // inline_version == 0 means "unknown"; fetch before deciding the path
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses the object cacher even if we hold the BUFFER cap
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  // completion plumbing for a possible inline-data migration
  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // data no longer fits inline (or we lack the cap): push it out to
      // RADOS, then fall through to a normal write
      onuninline = new C_SafeCond(&uninline_flock,
                                  &uninline_cond,
                                  &uninline_done,
                                  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // apply the write directly to the inline blob: keep any tail past
      // endoff, drop/zero-pad up to offset, then append the new bytes
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    // block for the OSD ack outside client_lock
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    mark_caps_dirty(in, CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = ceph_clock_now();
  in->change_attr++;
  mark_caps_dirty(in, CEPH_CAP_FILE_WR);

done:

  if (onuninline) {
    // wait (outside client_lock) for the uninline op, then retire the
    // local inline copy on success (or a racing uninliner's success,
    // signalled as -ECANCELED by the version guard)
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9307
9308int Client::_flush(Fh *f)
9309{
9310 Inode *in = f->inode.get();
9311 int err = f->take_async_err();
9312 if (err != 0) {
9313 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9314 << cpp_strerror(err) << dendl;
9315 } else {
9316 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9317 }
9318
9319 return err;
9320}
9321
// Path-based truncate: implemented as a setattrx of just the size.
// Only stx_size is initialized; the CEPH_SETATTR_SIZE mask limits which
// fields setattrx consults (presumably only the masked ones — confirm).
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
9328
// fd-based truncate: validates the handle, then issues a size-only
// _setattr on the inode.  Returns 0 or a negative errno.
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "ftruncate" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be used to modify the file
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  // only st_size is set; the CEPH_SETATTR_SIZE mask limits which fields
  // _setattr consults (presumably only the masked ones — confirm)
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
9350
// fsync(2)/fdatasync(2) entry point.  syncdataonly=true skips flushing
// metadata (dirty caps / unsafe MDS ops).  On success we still surface any
// background async write error recorded on the handle; on failure we clear
// the handle's async error so the same error is not reported twice.
int Client::fsync(int fd, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be synced
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9386
// Flush an inode's dirty state to stable storage.  Data: either via the
// object cacher (waiting on a real completion) or, without the cacher, by
// waiting for outstanding FILE_BUFFER refs to drain.  Metadata (unless
// syncdataonly): force dirty caps to the MDS and wait for the flush ack,
// and wait for any unsafe MDS requests to commit.  Drops and retakes
// client_lock while waiting on the cacher completion.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty caps to the MDS now; remember the flush tid to wait on
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // waiting for the newest unsafe op implies all older ones are safe too
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9451
// Handle-based convenience wrapper: log, then fsync the underlying inode.
int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
9457
9458int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9459{
9460 Mutex::Locker lock(client_lock);
9461 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9462 tout(cct) << fd << std::endl;
9463
181888fb
FG
9464 if (unmounting)
9465 return -ENOTCONN;
9466
7c673cae
FG
9467 Fh *f = get_filehandle(fd);
9468 if (!f)
9469 return -EBADF;
9470 int r = _getattr(f->inode, mask, perms);
9471 if (r < 0)
9472 return r;
9473 fill_stat(f->inode, stbuf, NULL);
9474 ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9475 return r;
9476}
9477
// statx-style fstat.  Unlike fstat, this only round-trips to the MDS when
// the caps we currently hold do not cover the requested field mask
// (derived from the statx want/flags), otherwise it answers from cache.
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  // only hit the MDS if our issued caps don't cover the requested fields
  if (mask && !f->inode->caps_issued_mask(mask)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9507
9508// not written yet, but i want to link!
9509
// Change the client's working directory to relpath and return the new
// absolute cwd string in new_cwd.  Returns 0 or a negative errno from the
// path walk.
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  // swap rather than assign so the old cwd ref is dropped via `in`
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9532
b5b8bbf5 9533void Client::_getcwd(string& dir, const UserPerm& perms)
7c673cae
FG
9534{
9535 filepath path;
9536 ldout(cct, 10) << "getcwd " << *cwd << dendl;
9537
9538 Inode *in = cwd.get();
9539 while (in != root) {
9540 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
9541
9542 // A cwd or ancester is unlinked
9543 if (in->dn_set.empty()) {
9544 return;
9545 }
9546
9547 Dentry *dn = in->get_first_parent();
9548
9549
9550 if (!dn) {
9551 // look it up
9552 ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
9553 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9554 filepath path(in->ino);
9555 req->set_filepath(path);
9556 req->set_inode(in);
9557 int res = make_request(req, perms);
9558 if (res < 0)
9559 break;
9560
9561 // start over
9562 path = filepath();
9563 in = cwd.get();
9564 continue;
9565 }
9566 path.push_front_dentry(dn->name);
9567 in = dn->dir->parent_inode;
9568 }
9569 dir = "/";
9570 dir += path.get_path();
9571}
9572
b5b8bbf5
FG
9573void Client::getcwd(string& dir, const UserPerm& perms)
9574{
9575 Mutex::Locker l(client_lock);
181888fb
FG
9576 if (!unmounting)
9577 _getcwd(dir, perms);
b5b8bbf5
FG
9578}
9579
7c673cae
FG
9580int Client::statfs(const char *path, struct statvfs *stbuf,
9581 const UserPerm& perms)
9582{
9583 Mutex::Locker l(client_lock);
9584 tout(cct) << "statfs" << std::endl;
9585
181888fb
FG
9586 if (unmounting)
9587 return -ENOTCONN;
9588
7c673cae
FG
9589 ceph_statfs stats;
9590 C_SaferCond cond;
d2e6a577
FG
9591
9592 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9593 if (data_pools.size() == 1) {
9594 objecter->get_fs_stats(stats, data_pools[0], &cond);
9595 } else {
9596 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9597 }
7c673cae
FG
9598
9599 client_lock.Unlock();
9600 int rval = cond.wait();
9601 client_lock.Lock();
9602
9603 if (rval < 0) {
9604 ldout(cct, 1) << "underlying call to statfs returned error: "
9605 << cpp_strerror(rval)
9606 << dendl;
9607 return rval;
9608 }
9609
9610 memset(stbuf, 0, sizeof(*stbuf));
9611
9612 /*
9613 * we're going to set a block size of 4MB so we can represent larger
9614 * FSes without overflowing. Additionally convert the space
9615 * measurements from KB to bytes while making them in terms of
9616 * blocks. We use 4MB only because it is big enough, and because it
9617 * actually *is* the (ceph) default block size.
9618 */
9619 const int CEPH_BLOCK_SHIFT = 22;
9620 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9621 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
9622 stbuf->f_files = stats.num_objects;
9623 stbuf->f_ffree = -1;
9624 stbuf->f_favail = -1;
9625 stbuf->f_fsid = -1; // ??
9626 stbuf->f_flag = 0; // ??
9627 stbuf->f_namemax = NAME_MAX;
9628
9629 // Usually quota_root will == root_ancestor, but if the mount root has no
9630 // quota but we can see a parent of it that does have a quota, we'll
9631 // respect that one instead.
9632 assert(root != nullptr);
9633 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
9634
9635 // get_quota_root should always give us something
9636 // because client quotas are always enabled
9637 assert(quota_root != nullptr);
9638
9639 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
9640
9641 // Skip the getattr if any sessions are stale, as we don't want to
9642 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9643 // is unhealthy.
9644 if (!_any_stale_sessions()) {
9645 int r = _getattr(quota_root, 0, perms, true);
9646 if (r != 0) {
9647 // Ignore return value: error getting latest inode metadata is not a good
9648 // reason to break "df".
9649 lderr(cct) << "Error in getattr on quota root 0x"
9650 << std::hex << quota_root->ino << std::dec
9651 << " statfs result may be outdated" << dendl;
9652 }
9653 }
9654
9655 // Special case: if there is a size quota set on the Inode acting
9656 // as the root for this client mount, then report the quota status
9657 // as the filesystem statistics.
9658 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
9659 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
31f18b77
FG
9660 // It is possible for a quota to be exceeded: arithmetic here must
9661 // handle case where used > total.
9662 const fsblkcnt_t free = total > used ? total - used : 0;
7c673cae
FG
9663
9664 stbuf->f_blocks = total;
9665 stbuf->f_bfree = free;
9666 stbuf->f_bavail = free;
9667 } else {
d2e6a577 9668 // General case: report the cluster statistics returned from RADOS. Because
7c673cae
FG
9669 // multiple pools may be used without one filesystem namespace via
9670 // layouts, this is the most correct thing we can do.
9671 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
9672 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
9673 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
9674 }
9675
9676 return rval;
9677}
9678
9679int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
9680 struct flock *fl, uint64_t owner, bool removing)
9681{
9682 ldout(cct, 10) << "_do_filelock ino " << in->ino
9683 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
9684 << " type " << fl->l_type << " owner " << owner
9685 << " " << fl->l_start << "~" << fl->l_len << dendl;
9686
9687 int lock_cmd;
9688 if (F_RDLCK == fl->l_type)
9689 lock_cmd = CEPH_LOCK_SHARED;
9690 else if (F_WRLCK == fl->l_type)
9691 lock_cmd = CEPH_LOCK_EXCL;
9692 else if (F_UNLCK == fl->l_type)
9693 lock_cmd = CEPH_LOCK_UNLOCK;
9694 else
9695 return -EIO;
9696
9697 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
9698 sleep = 0;
9699
9700 /*
9701 * Set the most significant bit, so that MDS knows the 'owner'
9702 * is sufficient to identify the owner of lock. (old code uses
9703 * both 'owner' and 'pid')
9704 */
9705 owner |= (1ULL << 63);
9706
9707 MetaRequest *req = new MetaRequest(op);
9708 filepath path;
9709 in->make_nosnap_relative_path(path);
9710 req->set_filepath(path);
9711 req->set_inode(in);
9712
9713 req->head.args.filelock_change.rule = lock_type;
9714 req->head.args.filelock_change.type = lock_cmd;
9715 req->head.args.filelock_change.owner = owner;
9716 req->head.args.filelock_change.pid = fl->l_pid;
9717 req->head.args.filelock_change.start = fl->l_start;
9718 req->head.args.filelock_change.length = fl->l_len;
9719 req->head.args.filelock_change.wait = sleep;
9720
9721 int ret;
9722 bufferlist bl;
9723
9724 if (sleep && switch_interrupt_cb) {
9725 // enable interrupt
9726 switch_interrupt_cb(callback_handle, req->get());
9727 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
7c673cae
FG
9728 // disable interrupt
9729 switch_interrupt_cb(callback_handle, NULL);
31f18b77
FG
9730 if (ret == 0 && req->aborted()) {
9731 // effect of this lock request has been revoked by the 'lock intr' request
9732 ret = req->get_abort_code();
9733 }
7c673cae
FG
9734 put_request(req);
9735 } else {
9736 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
9737 }
9738
9739 if (ret == 0) {
9740 if (op == CEPH_MDS_OP_GETFILELOCK) {
9741 ceph_filelock filelock;
9742 bufferlist::iterator p = bl.begin();
9743 ::decode(filelock, p);
9744
9745 if (CEPH_LOCK_SHARED == filelock.type)
9746 fl->l_type = F_RDLCK;
9747 else if (CEPH_LOCK_EXCL == filelock.type)
9748 fl->l_type = F_WRLCK;
9749 else
9750 fl->l_type = F_UNLCK;
9751
9752 fl->l_whence = SEEK_SET;
9753 fl->l_start = filelock.start;
9754 fl->l_len = filelock.length;
9755 fl->l_pid = filelock.pid;
9756 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
9757 ceph_lock_state_t *lock_state;
9758 if (lock_type == CEPH_LOCK_FCNTL) {
9759 if (!in->fcntl_locks)
9760 in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
9761 lock_state = in->fcntl_locks;
9762 } else if (lock_type == CEPH_LOCK_FLOCK) {
9763 if (!in->flock_locks)
9764 in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
9765 lock_state = in->flock_locks;
9766 } else {
9767 ceph_abort();
9768 return -EINVAL;
9769 }
9770 _update_lock_state(fl, owner, lock_state);
9771
9772 if (!removing) {
9773 if (lock_type == CEPH_LOCK_FCNTL) {
9774 if (!fh->fcntl_locks)
9775 fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
9776 lock_state = fh->fcntl_locks;
9777 } else {
9778 if (!fh->flock_locks)
9779 fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
9780 lock_state = fh->flock_locks;
9781 }
9782 _update_lock_state(fl, owner, lock_state);
9783 }
9784 } else
9785 ceph_abort();
9786 }
9787 return ret;
9788}
9789
9790int Client::_interrupt_filelock(MetaRequest *req)
9791{
31f18b77
FG
9792 // Set abort code, but do not kick. The abort code prevents the request
9793 // from being re-sent.
9794 req->abort(-EINTR);
9795 if (req->mds < 0)
9796 return 0; // haven't sent the request
9797
7c673cae
FG
9798 Inode *in = req->inode();
9799
9800 int lock_type;
9801 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
9802 lock_type = CEPH_LOCK_FLOCK_INTR;
9803 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
9804 lock_type = CEPH_LOCK_FCNTL_INTR;
9805 else {
9806 ceph_abort();
9807 return -EINVAL;
9808 }
9809
9810 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
9811 filepath path;
9812 in->make_nosnap_relative_path(path);
9813 intr_req->set_filepath(path);
9814 intr_req->set_inode(in);
9815 intr_req->head.args.filelock_change = req->head.args.filelock_change;
9816 intr_req->head.args.filelock_change.rule = lock_type;
9817 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
9818
9819 UserPerm perms(req->get_uid(), req->get_gid());
9820 return make_request(intr_req, perms, NULL, NULL, -1);
9821}
9822
9823void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9824{
9825 if (!in->fcntl_locks && !in->flock_locks)
9826 return;
9827
9828 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9829 ::encode(nr_fcntl_locks, bl);
9830 if (nr_fcntl_locks) {
9831 ceph_lock_state_t* lock_state = in->fcntl_locks;
9832 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9833 p != lock_state->held_locks.end();
9834 ++p)
9835 ::encode(p->second, bl);
9836 }
9837
9838 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9839 ::encode(nr_flock_locks, bl);
9840 if (nr_flock_locks) {
9841 ceph_lock_state_t* lock_state = in->flock_locks;
9842 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9843 p != lock_state->held_locks.end();
9844 ++p)
9845 ::encode(p->second, bl);
9846 }
9847
9848 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
9849 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
9850}
9851
9852void Client::_release_filelocks(Fh *fh)
9853{
9854 if (!fh->fcntl_locks && !fh->flock_locks)
9855 return;
9856
9857 Inode *in = fh->inode.get();
9858 ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
9859
9860 list<pair<int, ceph_filelock> > to_release;
9861
9862 if (fh->fcntl_locks) {
9863 ceph_lock_state_t* lock_state = fh->fcntl_locks;
9864 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9865 p != lock_state->held_locks.end();
9866 ++p)
9867 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
9868 delete fh->fcntl_locks;
9869 }
9870 if (fh->flock_locks) {
9871 ceph_lock_state_t* lock_state = fh->flock_locks;
9872 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9873 p != lock_state->held_locks.end();
9874 ++p)
9875 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
9876 delete fh->flock_locks;
9877 }
9878
9879 if (to_release.empty())
9880 return;
9881
9882 struct flock fl;
9883 memset(&fl, 0, sizeof(fl));
9884 fl.l_whence = SEEK_SET;
9885 fl.l_type = F_UNLCK;
9886
9887 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
9888 p != to_release.end();
9889 ++p) {
9890 fl.l_start = p->second.start;
9891 fl.l_len = p->second.length;
9892 fl.l_pid = p->second.pid;
9893 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
9894 p->second.owner, true);
9895 }
9896}
9897
9898void Client::_update_lock_state(struct flock *fl, uint64_t owner,
9899 ceph_lock_state_t *lock_state)
9900{
9901 int lock_cmd;
9902 if (F_RDLCK == fl->l_type)
9903 lock_cmd = CEPH_LOCK_SHARED;
9904 else if (F_WRLCK == fl->l_type)
9905 lock_cmd = CEPH_LOCK_EXCL;
9906 else
9907 lock_cmd = CEPH_LOCK_UNLOCK;;
9908
9909 ceph_filelock filelock;
9910 filelock.start = fl->l_start;
9911 filelock.length = fl->l_len;
9912 filelock.client = 0;
9913 // see comment in _do_filelock()
9914 filelock.owner = owner | (1ULL << 63);
9915 filelock.pid = fl->l_pid;
9916 filelock.type = lock_cmd;
9917
9918 if (filelock.type == CEPH_LOCK_UNLOCK) {
9919 list<ceph_filelock> activated_locks;
9920 lock_state->remove_lock(filelock, activated_locks);
9921 } else {
9922 bool r = lock_state->add_lock(filelock, false, false, NULL);
9923 assert(r);
9924 }
9925}
9926
9927int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
9928{
9929 Inode *in = fh->inode.get();
9930 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
9931 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
9932 return ret;
9933}
9934
9935int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
9936{
9937 Inode *in = fh->inode.get();
9938 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
9939 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
9940 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
9941 return ret;
9942}
9943
9944int Client::_flock(Fh *fh, int cmd, uint64_t owner)
9945{
9946 Inode *in = fh->inode.get();
9947 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
9948
9949 int sleep = !(cmd & LOCK_NB);
9950 cmd &= ~LOCK_NB;
9951
9952 int type;
9953 switch (cmd) {
9954 case LOCK_SH:
9955 type = F_RDLCK;
9956 break;
9957 case LOCK_EX:
9958 type = F_WRLCK;
9959 break;
9960 case LOCK_UN:
9961 type = F_UNLCK;
9962 break;
9963 default:
9964 return -EINVAL;
9965 }
9966
9967 struct flock fl;
9968 memset(&fl, 0, sizeof(fl));
9969 fl.l_type = type;
9970 fl.l_whence = SEEK_SET;
9971
9972 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
9973 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
9974 return ret;
9975}
9976
9977int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
9978{
9979 /* Since the only thing this does is wrap a call to statfs, and
9980 statfs takes a lock, it doesn't seem we have a need to split it
9981 out. */
9982 return statfs(0, stbuf, perms);
9983}
9984
9985void Client::ll_register_callbacks(struct client_callback_args *args)
9986{
9987 if (!args)
9988 return;
9989 Mutex::Locker l(client_lock);
9990 ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
9991 << " invalidate_ino_cb " << args->ino_cb
9992 << " invalidate_dentry_cb " << args->dentry_cb
9993 << " getgroups_cb" << args->getgroups_cb
9994 << " switch_interrupt_cb " << args->switch_intr_cb
9995 << " remount_cb " << args->remount_cb
9996 << dendl;
9997 callback_handle = args->handle;
9998 if (args->ino_cb) {
9999 ino_invalidate_cb = args->ino_cb;
10000 async_ino_invalidator.start();
10001 }
10002 if (args->dentry_cb) {
10003 dentry_invalidate_cb = args->dentry_cb;
10004 async_dentry_invalidator.start();
10005 }
10006 if (args->switch_intr_cb) {
10007 switch_interrupt_cb = args->switch_intr_cb;
10008 interrupt_finisher.start();
10009 }
10010 if (args->remount_cb) {
10011 remount_cb = args->remount_cb;
10012 remount_finisher.start();
10013 }
10014 getgroups_cb = args->getgroups_cb;
10015 umask_cb = args->umask_cb;
10016}
10017
10018int Client::test_dentry_handling(bool can_invalidate)
10019{
10020 int r = 0;
10021
10022 can_invalidate_dentries = can_invalidate;
10023
10024 if (can_invalidate_dentries) {
10025 assert(dentry_invalidate_cb);
10026 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10027 } else if (remount_cb) {
10028 ldout(cct, 1) << "using remount_cb" << dendl;
10029 int s = remount_cb(callback_handle);
10030 if (s) {
10031 lderr(cct) << "Failed to invoke remount, needed to ensure kernel dcache consistency"
10032 << dendl;
10033 }
10034 if (cct->_conf->client_die_on_failed_remount) {
10035 require_remount = true;
10036 r = s;
10037 }
10038 } else {
10039 lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
10040 if (cct->_conf->client_die_on_failed_remount)
10041 ceph_abort();
10042 }
10043 return r;
10044}
10045
10046int Client::_sync_fs()
10047{
10048 ldout(cct, 10) << "_sync_fs" << dendl;
10049
10050 // flush file data
10051 Mutex lock("Client::_fsync::lock");
10052 Cond cond;
10053 bool flush_done = false;
10054 if (cct->_conf->client_oc)
10055 objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
10056 else
10057 flush_done = true;
10058
10059 // flush caps
10060 flush_caps_sync();
10061 ceph_tid_t flush_tid = last_flush_tid;
10062
10063 // wait for unsafe mds requests
10064 wait_unsafe_requests();
10065
10066 wait_sync_caps(flush_tid);
10067
10068 if (!flush_done) {
10069 client_lock.Unlock();
10070 lock.Lock();
10071 ldout(cct, 15) << "waiting on data to flush" << dendl;
10072 while (!flush_done)
10073 cond.Wait(lock);
10074 lock.Unlock();
10075 client_lock.Lock();
10076 }
10077
10078 return 0;
10079}
10080
10081int Client::sync_fs()
10082{
10083 Mutex::Locker l(client_lock);
181888fb
FG
10084
10085 if (unmounting)
10086 return -ENOTCONN;
10087
7c673cae
FG
10088 return _sync_fs();
10089}
10090
10091int64_t Client::drop_caches()
10092{
10093 Mutex::Locker l(client_lock);
10094 return objectcacher->release_all();
10095}
10096
10097
10098int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10099{
10100 Mutex::Locker l(client_lock);
10101 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10102 << ", " << offset << ", " << count << ")" << dendl;
10103
10104 Fh *f = get_filehandle(fd);
10105 if (!f)
10106 return -EBADF;
10107
10108 // for now
10109 _fsync(f, true);
10110
10111 return 0;
10112}
10113
10114int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10115{
10116 Mutex::Locker l(client_lock);
10117 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10118 << ", " << offset << ", " << count << ")" << dendl;
10119
10120 Fh *f = get_filehandle(fd);
10121 if (!f)
10122 return -EBADF;
10123 Inode *in = f->inode.get();
10124
10125 _fsync(f, true);
10126 if (_release(in))
10127 check_caps(in, 0);
10128 return 0;
10129}
10130
10131
10132// =============================
10133// snaps
10134
10135int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10136{
10137 Mutex::Locker l(client_lock);
181888fb
FG
10138
10139 if (unmounting)
10140 return -ENOTCONN;
10141
7c673cae
FG
10142 filepath path(relpath);
10143 InodeRef in;
10144 int r = path_walk(path, &in, perm);
10145 if (r < 0)
10146 return r;
10147 if (cct->_conf->client_permissions) {
10148 r = may_create(in.get(), perm);
10149 if (r < 0)
10150 return r;
10151 }
10152 Inode *snapdir = open_snapdir(in.get());
10153 return _mkdir(snapdir, name, 0, perm);
10154}
181888fb 10155
7c673cae
FG
10156int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10157{
10158 Mutex::Locker l(client_lock);
181888fb
FG
10159
10160 if (unmounting)
10161 return -ENOTCONN;
10162
7c673cae
FG
10163 filepath path(relpath);
10164 InodeRef in;
10165 int r = path_walk(path, &in, perms);
10166 if (r < 0)
10167 return r;
10168 if (cct->_conf->client_permissions) {
10169 r = may_delete(in.get(), NULL, perms);
10170 if (r < 0)
10171 return r;
10172 }
10173 Inode *snapdir = open_snapdir(in.get());
10174 return _rmdir(snapdir, name, perms);
10175}
10176
10177// =============================
10178// expose caps
10179
10180int Client::get_caps_issued(int fd) {
10181
10182 Mutex::Locker lock(client_lock);
10183
181888fb
FG
10184 if (unmounting)
10185 return -ENOTCONN;
10186
7c673cae
FG
10187 Fh *f = get_filehandle(fd);
10188 if (!f)
10189 return -EBADF;
10190
10191 return f->inode->caps_issued();
10192}
10193
10194int Client::get_caps_issued(const char *path, const UserPerm& perms)
10195{
10196 Mutex::Locker lock(client_lock);
181888fb
FG
10197
10198 if (unmounting)
10199 return -ENOTCONN;
10200
7c673cae
FG
10201 filepath p(path);
10202 InodeRef in;
10203 int r = path_walk(p, &in, perms, true);
10204 if (r < 0)
10205 return r;
10206 return in->caps_issued();
10207}
10208
10209// =========================================
10210// low level
10211
10212Inode *Client::open_snapdir(Inode *diri)
10213{
10214 Inode *in;
10215 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10216 if (!inode_map.count(vino)) {
10217 in = new Inode(this, vino, &diri->layout);
10218
10219 in->ino = diri->ino;
10220 in->snapid = CEPH_SNAPDIR;
10221 in->mode = diri->mode;
10222 in->uid = diri->uid;
10223 in->gid = diri->gid;
10224 in->mtime = diri->mtime;
10225 in->ctime = diri->ctime;
10226 in->btime = diri->btime;
10227 in->size = diri->size;
10228 in->change_attr = diri->change_attr;
10229
10230 in->dirfragtree.clear();
10231 in->snapdir_parent = diri;
10232 diri->flags |= I_SNAPDIR_OPEN;
10233 inode_map[vino] = in;
10234 if (use_faked_inos())
10235 _assign_faked_ino(in);
10236 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10237 } else {
10238 in = inode_map[vino];
10239 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10240 }
10241 return in;
10242}
10243
10244int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10245 Inode **out, const UserPerm& perms)
10246{
10247 Mutex::Locker lock(client_lock);
31f18b77
FG
10248 vinodeno_t vparent = _get_vino(parent);
10249 ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
7c673cae
FG
10250 tout(cct) << "ll_lookup" << std::endl;
10251 tout(cct) << name << std::endl;
10252
181888fb
FG
10253 if (unmounting)
10254 return -ENOTCONN;
10255
7c673cae
FG
10256 int r = 0;
10257 if (!cct->_conf->fuse_default_permissions) {
10258 r = may_lookup(parent, perms);
10259 if (r < 0)
10260 return r;
10261 }
10262
10263 string dname(name);
10264 InodeRef in;
10265
10266 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10267 if (r < 0) {
10268 attr->st_ino = 0;
10269 goto out;
10270 }
10271
10272 assert(in);
10273 fill_stat(in, attr);
10274 _ll_get(in.get());
10275
10276 out:
31f18b77 10277 ldout(cct, 3) << "ll_lookup " << vparent << " " << name
7c673cae
FG
10278 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10279 tout(cct) << attr->st_ino << std::endl;
10280 *out = in.get();
10281 return r;
10282}
10283
10284int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10285 struct ceph_statx *stx, unsigned want, unsigned flags,
10286 const UserPerm& perms)
10287{
10288 Mutex::Locker lock(client_lock);
31f18b77
FG
10289 vinodeno_t vparent = _get_vino(parent);
10290 ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
7c673cae
FG
10291 tout(cct) << "ll_lookupx" << std::endl;
10292 tout(cct) << name << std::endl;
10293
181888fb
FG
10294 if (unmounting)
10295 return -ENOTCONN;
10296
7c673cae
FG
10297 int r = 0;
10298 if (!cct->_conf->fuse_default_permissions) {
10299 r = may_lookup(parent, perms);
10300 if (r < 0)
10301 return r;
10302 }
10303
10304 string dname(name);
10305 InodeRef in;
10306
10307 unsigned mask = statx_to_mask(flags, want);
10308 r = _lookup(parent, dname, mask, &in, perms);
10309 if (r < 0) {
10310 stx->stx_ino = 0;
10311 stx->stx_mask = 0;
10312 } else {
10313 assert(in);
10314 fill_statx(in, mask, stx);
10315 _ll_get(in.get());
10316 }
10317
31f18b77 10318 ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
7c673cae
FG
10319 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10320 tout(cct) << stx->stx_ino << std::endl;
10321 *out = in.get();
10322 return r;
10323}
10324
10325int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10326 unsigned int want, unsigned int flags, const UserPerm& perms)
10327{
10328 Mutex::Locker lock(client_lock);
181888fb
FG
10329
10330 if (unmounting)
10331 return -ENOTCONN;
10332
7c673cae
FG
10333 filepath fp(name, 0);
10334 InodeRef in;
10335 int rc;
10336 unsigned mask = statx_to_mask(flags, want);
10337
10338 ldout(cct, 3) << "ll_walk" << name << dendl;
10339 tout(cct) << "ll_walk" << std::endl;
10340 tout(cct) << name << std::endl;
10341
10342 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10343 if (rc < 0) {
10344 /* zero out mask, just in case... */
10345 stx->stx_mask = 0;
10346 stx->stx_ino = 0;
10347 *out = NULL;
10348 return rc;
10349 } else {
10350 assert(in);
10351 fill_statx(in, mask, stx);
10352 _ll_get(in.get());
10353 *out = in.get();
10354 return 0;
10355 }
10356}
10357
10358void Client::_ll_get(Inode *in)
10359{
10360 if (in->ll_ref == 0) {
10361 in->get();
10362 if (in->is_dir() && !in->dn_set.empty()) {
10363 assert(in->dn_set.size() == 1); // dirs can't be hard-linked
10364 in->get_first_parent()->get(); // pin dentry
10365 }
10366 }
10367 in->ll_get();
10368 ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
10369}
10370
10371int Client::_ll_put(Inode *in, int num)
10372{
10373 in->ll_put(num);
10374 ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
10375 if (in->ll_ref == 0) {
10376 if (in->is_dir() && !in->dn_set.empty()) {
10377 assert(in->dn_set.size() == 1); // dirs can't be hard-linked
10378 in->get_first_parent()->put(); // unpin dentry
10379 }
10380 put_inode(in);
10381 return 0;
10382 } else {
10383 return in->ll_ref;
10384 }
10385}
10386
10387void Client::_ll_drop_pins()
10388{
10389 ldout(cct, 10) << "_ll_drop_pins" << dendl;
10390 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10391 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10392 it != inode_map.end();
10393 it = next) {
10394 Inode *in = it->second;
10395 next = it;
10396 ++next;
10397 if (in->ll_ref)
10398 _ll_put(in, in->ll_ref);
10399 }
10400}
10401
10402bool Client::ll_forget(Inode *in, int count)
10403{
10404 Mutex::Locker lock(client_lock);
10405 inodeno_t ino = _get_inodeno(in);
10406
10407 ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl;
10408 tout(cct) << "ll_forget" << std::endl;
10409 tout(cct) << ino.val << std::endl;
10410 tout(cct) << count << std::endl;
10411
181888fb
FG
10412 // Ignore forget if we're no longer mounted
10413 if (unmounting)
10414 return true;
10415
7c673cae
FG
10416 if (ino == 1) return true; // ignore forget on root.
10417
10418 bool last = false;
10419 if (in->ll_ref < count) {
10420 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
10421 << ", which only has ll_ref=" << in->ll_ref << dendl;
10422 _ll_put(in, in->ll_ref);
10423 last = true;
10424 } else {
10425 if (_ll_put(in, count) == 0)
10426 last = true;
10427 }
10428
10429 return last;
10430}
10431
10432bool Client::ll_put(Inode *in)
10433{
10434 /* ll_forget already takes the lock */
10435 return ll_forget(in, 1);
10436}
10437
10438snapid_t Client::ll_get_snapid(Inode *in)
10439{
10440 Mutex::Locker lock(client_lock);
10441 return in->snapid;
10442}
10443
10444Inode *Client::ll_get_inode(ino_t ino)
10445{
10446 Mutex::Locker lock(client_lock);
181888fb
FG
10447
10448 if (unmounting)
10449 return NULL;
10450
7c673cae
FG
10451 vinodeno_t vino = _map_faked_ino(ino);
10452 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10453 if (p == inode_map.end())
10454 return NULL;
10455 Inode *in = p->second;
10456 _ll_get(in);
10457 return in;
10458}
10459
10460Inode *Client::ll_get_inode(vinodeno_t vino)
10461{
10462 Mutex::Locker lock(client_lock);
181888fb
FG
10463
10464 if (unmounting)
10465 return NULL;
10466
7c673cae
FG
10467 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10468 if (p == inode_map.end())
10469 return NULL;
10470 Inode *in = p->second;
10471 _ll_get(in);
10472 return in;
10473}
10474
10475int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10476{
10477 vinodeno_t vino = _get_vino(in);
10478
10479 ldout(cct, 3) << "ll_getattr " << vino << dendl;
10480 tout(cct) << "ll_getattr" << std::endl;
10481 tout(cct) << vino.ino.val << std::endl;
10482
10483 if (vino.snapid < CEPH_NOSNAP)
10484 return 0;
10485 else
10486 return _getattr(in, caps, perms);
10487}
10488
10489int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10490{
10491 Mutex::Locker lock(client_lock);
10492
181888fb
FG
10493 if (unmounting)
10494 return -ENOTCONN;
10495
7c673cae
FG
10496 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10497
10498 if (res == 0)
10499 fill_stat(in, attr);
10500 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10501 return res;
10502}
10503
10504int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10505 unsigned int flags, const UserPerm& perms)
10506{
10507 Mutex::Locker lock(client_lock);
10508
181888fb
FG
10509 if (unmounting)
10510 return -ENOTCONN;
10511
7c673cae
FG
10512 int res = 0;
10513 unsigned mask = statx_to_mask(flags, want);
10514
10515 if (mask && !in->caps_issued_mask(mask))
10516 res = _ll_getattr(in, mask, perms);
10517
10518 if (res == 0)
10519 fill_statx(in, mask, stx);
10520 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10521 return res;
10522}
10523
10524int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10525 const UserPerm& perms, InodeRef *inp)
10526{
10527 vinodeno_t vino = _get_vino(in);
10528
10529 ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec
10530 << dendl;
10531 tout(cct) << "ll_setattrx" << std::endl;
10532 tout(cct) << vino.ino.val << std::endl;
10533 tout(cct) << stx->stx_mode << std::endl;
10534 tout(cct) << stx->stx_uid << std::endl;
10535 tout(cct) << stx->stx_gid << std::endl;
10536 tout(cct) << stx->stx_size << std::endl;
10537 tout(cct) << stx->stx_mtime << std::endl;
10538 tout(cct) << stx->stx_atime << std::endl;
10539 tout(cct) << stx->stx_btime << std::endl;
10540 tout(cct) << mask << std::endl;
10541
10542 if (!cct->_conf->fuse_default_permissions) {
10543 int res = may_setattr(in, stx, mask, perms);
10544 if (res < 0)
10545 return res;
10546 }
10547
10548 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
10549
10550 return __setattrx(in, stx, mask, perms, inp);
10551}
10552
10553int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10554 const UserPerm& perms)
10555{
10556 Mutex::Locker lock(client_lock);
181888fb
FG
10557
10558 if (unmounting)
10559 return -ENOTCONN;
10560
7c673cae
FG
10561 InodeRef target(in);
10562 int res = _ll_setattrx(in, stx, mask, perms, &target);
10563 if (res == 0) {
10564 assert(in == target.get());
10565 fill_statx(in, in->caps_issued(), stx);
10566 }
10567
10568 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10569 return res;
10570}
10571
10572int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10573 const UserPerm& perms)
10574{
10575 struct ceph_statx stx;
10576 stat_to_statx(attr, &stx);
10577
10578 Mutex::Locker lock(client_lock);
181888fb
FG
10579
10580 if (unmounting)
10581 return -ENOTCONN;
10582
7c673cae
FG
10583 InodeRef target(in);
10584 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10585 if (res == 0) {
10586 assert(in == target.get());
10587 fill_stat(in, attr);
10588 }
10589
10590 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10591 return res;
10592}
10593
10594
10595// ----------
10596// xattrs
10597
10598int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10599 const UserPerm& perms)
10600{
10601 Mutex::Locker lock(client_lock);
181888fb
FG
10602
10603 if (unmounting)
10604 return -ENOTCONN;
10605
7c673cae
FG
10606 InodeRef in;
10607 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10608 if (r < 0)
10609 return r;
10610 return _getxattr(in, name, value, size, perms);
10611}
10612
10613int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10614 const UserPerm& perms)
10615{
10616 Mutex::Locker lock(client_lock);
181888fb
FG
10617
10618 if (unmounting)
10619 return -ENOTCONN;
10620
7c673cae
FG
10621 InodeRef in;
10622 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10623 if (r < 0)
10624 return r;
10625 return _getxattr(in, name, value, size, perms);
10626}
10627
10628int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10629 const UserPerm& perms)
10630{
10631 Mutex::Locker lock(client_lock);
181888fb
FG
10632
10633 if (unmounting)
10634 return -ENOTCONN;
10635
7c673cae
FG
10636 Fh *f = get_filehandle(fd);
10637 if (!f)
10638 return -EBADF;
10639 return _getxattr(f->inode, name, value, size, perms);
10640}
10641
10642int Client::listxattr(const char *path, char *list, size_t size,
10643 const UserPerm& perms)
10644{
10645 Mutex::Locker lock(client_lock);
181888fb
FG
10646
10647 if (unmounting)
10648 return -ENOTCONN;
10649
7c673cae
FG
10650 InodeRef in;
10651 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10652 if (r < 0)
10653 return r;
10654 return Client::_listxattr(in.get(), list, size, perms);
10655}
10656
10657int Client::llistxattr(const char *path, char *list, size_t size,
10658 const UserPerm& perms)
10659{
10660 Mutex::Locker lock(client_lock);
181888fb
FG
10661
10662 if (unmounting)
10663 return -ENOTCONN;
10664
7c673cae
FG
10665 InodeRef in;
10666 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10667 if (r < 0)
10668 return r;
10669 return Client::_listxattr(in.get(), list, size, perms);
10670}
10671
10672int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10673{
10674 Mutex::Locker lock(client_lock);
181888fb
FG
10675
10676 if (unmounting)
10677 return -ENOTCONN;
10678
7c673cae
FG
10679 Fh *f = get_filehandle(fd);
10680 if (!f)
10681 return -EBADF;
10682 return Client::_listxattr(f->inode.get(), list, size, perms);
10683}
10684
10685int Client::removexattr(const char *path, const char *name,
10686 const UserPerm& perms)
10687{
10688 Mutex::Locker lock(client_lock);
181888fb
FG
10689
10690 if (unmounting)
10691 return -ENOTCONN;
10692
7c673cae
FG
10693 InodeRef in;
10694 int r = Client::path_walk(path, &in, perms, true);
10695 if (r < 0)
10696 return r;
10697 return _removexattr(in, name, perms);
10698}
10699
10700int Client::lremovexattr(const char *path, const char *name,
10701 const UserPerm& perms)
10702{
10703 Mutex::Locker lock(client_lock);
181888fb
FG
10704
10705 if (unmounting)
10706 return -ENOTCONN;
10707
7c673cae
FG
10708 InodeRef in;
10709 int r = Client::path_walk(path, &in, perms, false);
10710 if (r < 0)
10711 return r;
10712 return _removexattr(in, name, perms);
10713}
10714
10715int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10716{
10717 Mutex::Locker lock(client_lock);
181888fb
FG
10718
10719 if (unmounting)
10720 return -ENOTCONN;
10721
7c673cae
FG
10722 Fh *f = get_filehandle(fd);
10723 if (!f)
10724 return -EBADF;
10725 return _removexattr(f->inode, name, perms);
10726}
10727
10728int Client::setxattr(const char *path, const char *name, const void *value,
10729 size_t size, int flags, const UserPerm& perms)
10730{
10731 _setxattr_maybe_wait_for_osdmap(name, value, size);
10732
10733 Mutex::Locker lock(client_lock);
181888fb
FG
10734
10735 if (unmounting)
10736 return -ENOTCONN;
10737
7c673cae
FG
10738 InodeRef in;
10739 int r = Client::path_walk(path, &in, perms, true);
10740 if (r < 0)
10741 return r;
10742 return _setxattr(in, name, value, size, flags, perms);
10743}
10744
10745int Client::lsetxattr(const char *path, const char *name, const void *value,
10746 size_t size, int flags, const UserPerm& perms)
10747{
10748 _setxattr_maybe_wait_for_osdmap(name, value, size);
10749
10750 Mutex::Locker lock(client_lock);
181888fb
FG
10751
10752 if (unmounting)
10753 return -ENOTCONN;
10754
7c673cae
FG
10755 InodeRef in;
10756 int r = Client::path_walk(path, &in, perms, false);
10757 if (r < 0)
10758 return r;
10759 return _setxattr(in, name, value, size, flags, perms);
10760}
10761
10762int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10763 int flags, const UserPerm& perms)
10764{
10765 _setxattr_maybe_wait_for_osdmap(name, value, size);
10766
10767 Mutex::Locker lock(client_lock);
181888fb
FG
10768
10769 if (unmounting)
10770 return -ENOTCONN;
10771
7c673cae
FG
10772 Fh *f = get_filehandle(fd);
10773 if (!f)
10774 return -EBADF;
10775 return _setxattr(f->inode, name, value, size, flags, perms);
10776}
10777
10778int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10779 const UserPerm& perms)
10780{
10781 int r;
10782
10783 const VXattr *vxattr = _match_vxattr(in, name);
10784 if (vxattr) {
10785 r = -ENODATA;
10786
10787 // Do a force getattr to get the latest quota before returning
10788 // a value to userspace.
10789 r = _getattr(in, 0, perms, true);
10790 if (r != 0) {
10791 // Error from getattr!
10792 return r;
10793 }
10794
10795 // call pointer-to-member function
10796 char buf[256];
10797 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10798 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10799 } else {
10800 r = -ENODATA;
10801 }
10802
10803 if (size != 0) {
10804 if (r > (int)size) {
10805 r = -ERANGE;
10806 } else if (r > 0) {
10807 memcpy(value, buf, r);
10808 }
10809 }
10810 goto out;
10811 }
10812
10813 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10814 r = -EOPNOTSUPP;
10815 goto out;
10816 }
10817
10818 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10819 if (r == 0) {
10820 string n(name);
10821 r = -ENODATA;
10822 if (in->xattrs.count(n)) {
10823 r = in->xattrs[n].length();
10824 if (r > 0 && size != 0) {
10825 if (size >= (unsigned)r)
10826 memcpy(value, in->xattrs[n].c_str(), r);
10827 else
10828 r = -ERANGE;
10829 }
10830 }
10831 }
10832 out:
10833 ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
10834 return r;
10835}
10836
10837int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
10838 const UserPerm& perms)
10839{
10840 if (cct->_conf->client_permissions) {
10841 int r = xattr_permission(in.get(), name, MAY_READ, perms);
10842 if (r < 0)
10843 return r;
10844 }
10845 return _getxattr(in.get(), name, value, size, perms);
10846}
10847
10848int Client::ll_getxattr(Inode *in, const char *name, void *value,
10849 size_t size, const UserPerm& perms)
10850{
10851 Mutex::Locker lock(client_lock);
10852
181888fb
FG
10853 if (unmounting)
10854 return -ENOTCONN;
10855
7c673cae
FG
10856 vinodeno_t vino = _get_vino(in);
10857
10858 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
10859 tout(cct) << "ll_getxattr" << std::endl;
10860 tout(cct) << vino.ino.val << std::endl;
10861 tout(cct) << name << std::endl;
10862
10863 if (!cct->_conf->fuse_default_permissions) {
10864 int r = xattr_permission(in, name, MAY_READ, perms);
10865 if (r < 0)
10866 return r;
10867 }
10868
10869 return _getxattr(in, name, value, size, perms);
10870}
10871
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  // List all xattr names (real ones plus non-hidden virtual ones) as a
  // sequence of NUL-terminated strings. Returns the total byte length
  // required; when 'size' != 0 the names are copied into 'name', or
  // -ERANGE is returned if the buffer is too small. size == 0 is a
  // probe for the required length.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    // First pass: total up the space needed for the real xattrs...
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    // ...plus the non-hidden virtual xattr names.
    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    if (size != 0) {
      if (size >= (unsigned)r) {
	// Second pass: emit "name\0name\0..." into the caller's buffer.
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	if (vxattrs) {
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    // hidden vxattrs are readable but never listed
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	r = -ERANGE;
    }
  }
  ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
10916
10917int Client::ll_listxattr(Inode *in, char *names, size_t size,
10918 const UserPerm& perms)
10919{
10920 Mutex::Locker lock(client_lock);
10921
181888fb
FG
10922 if (unmounting)
10923 return -ENOTCONN;
10924
7c673cae
FG
10925 vinodeno_t vino = _get_vino(in);
10926
10927 ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
10928 tout(cct) << "ll_listxattr" << std::endl;
10929 tout(cct) << vino.ino.val << std::endl;
10930 tout(cct) << size << std::endl;
10931
10932 return _listxattr(in, names, size, perms);
10933}
10934
10935int Client::_do_setxattr(Inode *in, const char *name, const void *value,
10936 size_t size, int flags, const UserPerm& perms)
10937{
10938
10939 int xattr_flags = 0;
10940 if (!value)
10941 xattr_flags |= CEPH_XATTR_REMOVE;
10942 if (flags & XATTR_CREATE)
10943 xattr_flags |= CEPH_XATTR_CREATE;
10944 if (flags & XATTR_REPLACE)
10945 xattr_flags |= CEPH_XATTR_REPLACE;
10946
10947 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
10948 filepath path;
10949 in->make_nosnap_relative_path(path);
10950 req->set_filepath(path);
10951 req->set_string2(name);
10952 req->set_inode(in);
10953 req->head.args.setxattr.flags = xattr_flags;
10954
10955 bufferlist bl;
10956 bl.append((const char*)value, size);
10957 req->set_data(bl);
10958
10959 int res = make_request(req, perms);
10960
10961 trim_cache();
10962 ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
10963 res << dendl;
10964 return res;
10965}
10966
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Validate the xattr namespace, give POSIX ACL xattrs their special
  // handling, then hand off to _do_setxattr() for the MDS round trip.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;  // snapshots are read-only
  }

  // "system.*" names are only meaningful when POSIX ACLs are enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Accept only the namespaces the kernel client also supports.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// An access ACL equivalent to a plain mode is applied as a
	// chmod instead of being stored (ret == 0 -> drop the value).
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // NOTE(review): stx is otherwise uninitialized; assumes
	  // _do_setattr only reads fields selected by the
	  // CEPH_SETATTR_MODE mask — confirm.
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    // Reject writes to read-only virtual xattrs.
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}
11027
11028int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11029 size_t size, int flags, const UserPerm& perms)
11030{
11031 if (cct->_conf->client_permissions) {
11032 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11033 if (r < 0)
11034 return r;
11035 }
11036 return _setxattr(in.get(), name, value, size, flags, perms);
11037}
11038
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  // For layout-changing xattrs, extract the target data pool (if any)
  // from the value and verify it exists in the given osdmap. Returns
  // 0 when no pool is named or the pool exists, -EINVAL on a malformed
  // value, -ENOENT when the pool is unknown to this osdmap.
  string tmp;
  if (name == "layout") {
    // Full-layout form: value is "key=val key=val ..."; parse it and
    // pull out the "pool" key if present.
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    if (begin != end)
      return -EINVAL;  // trailing unparsed garbage
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    // Single-field form: the whole value names the pool.
    tmp = value;
  }

  if (tmp.length()) {
    // The pool may be given numerically or by name.
    int64_t pool;
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11078
11079void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11080{
11081 // For setting pool of layout, MetaRequest need osdmap epoch.
11082 // There is a race which create a new data pool but client and mds both don't have.
11083 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11084 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11085 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11086 string rest(strstr(name, "layout"));
11087 string v((const char*)value, size);
11088 int r = objecter->with_osdmap([&](const OSDMap& o) {
11089 return _setxattr_check_data_pool(rest, v, &o);
11090 });
11091
11092 if (r == -ENOENT) {
11093 C_SaferCond ctx;
11094 objecter->wait_for_latest_osdmap(&ctx);
11095 ctx.wait();
11096 }
11097 }
11098}
11099
11100int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11101 size_t size, int flags, const UserPerm& perms)
11102{
11103 _setxattr_maybe_wait_for_osdmap(name, value, size);
11104
11105 Mutex::Locker lock(client_lock);
11106
181888fb
FG
11107 if (unmounting)
11108 return -ENOTCONN;
11109
7c673cae
FG
11110 vinodeno_t vino = _get_vino(in);
11111
11112 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
11113 tout(cct) << "ll_setxattr" << std::endl;
11114 tout(cct) << vino.ino.val << std::endl;
11115 tout(cct) << name << std::endl;
11116
11117 if (!cct->_conf->fuse_default_permissions) {
11118 int r = xattr_permission(in, name, MAY_WRITE, perms);
11119 if (r < 0)
11120 return r;
11121 }
11122 return _setxattr(in, name, value, size, flags, perms);
11123}
11124
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  // Remove an extended attribute via an MDS RMXATTR request.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;  // snapshots are read-only
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  // Read-only virtual xattrs cannot be removed.
  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  // NOTE(review): the attribute name travels in filepath2 here while
  // SETXATTR uses set_string2 — presumably the RMXATTR wire
  // convention; confirm against the MDS before changing.
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
11156
11157int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11158{
11159 if (cct->_conf->client_permissions) {
11160 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11161 if (r < 0)
11162 return r;
11163 }
11164 return _removexattr(in.get(), name, perms);
11165}
11166
11167int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11168{
11169 Mutex::Locker lock(client_lock);
11170
181888fb
FG
11171 if (unmounting)
11172 return -ENOTCONN;
11173
7c673cae
FG
11174 vinodeno_t vino = _get_vino(in);
11175
11176 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11177 tout(cct) << "ll_removexattr" << std::endl;
11178 tout(cct) << vino.ino.val << std::endl;
11179 tout(cct) << name << std::endl;
11180
11181 if (!cct->_conf->fuse_default_permissions) {
11182 int r = xattr_permission(in, name, MAY_WRITE, perms);
11183 if (r < 0)
11184 return r;
11185 }
11186
11187 return _removexattr(in, name, perms);
11188}
11189
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  // "ceph.quota*" vxattrs exist only while a quota is set on the inode.
  return in->quota.is_enable();
}
// Render the combined quota vxattr, e.g. "max_bytes=0 max_files=100".
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
// Render just the byte limit.
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
// Render just the file-count limit.
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
11209
11210bool Client::_vxattrcb_layout_exists(Inode *in)
11211{
11212 return in->layout != file_layout_t();
11213}
11214size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11215{
11216 int r = snprintf(val, size,
11217 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
11218 (unsigned long long)in->layout.stripe_unit,
11219 (unsigned long long)in->layout.stripe_count,
11220 (unsigned long long)in->layout.object_size);
11221 objecter->with_osdmap([&](const OSDMap& o) {
11222 if (o.have_pg_pool(in->layout.pool_id))
11223 r += snprintf(val + r, size - r, "%s",
11224 o.get_pool_name(in->layout.pool_id).c_str());
11225 else
11226 r += snprintf(val + r, size - r, "%" PRIu64,
11227 (uint64_t)in->layout.pool_id);
11228 });
11229 if (in->layout.pool_ns.length())
11230 r += snprintf(val + r, size - r, " pool_namespace=%s",
11231 in->layout.pool_ns.c_str());
11232 return r;
11233}
11234size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11235{
11236 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
11237}
11238size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11239{
11240 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
11241}
11242size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11243{
11244 return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
11245}
11246size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11247{
11248 size_t r;
11249 objecter->with_osdmap([&](const OSDMap& o) {
11250 if (o.have_pg_pool(in->layout.pool_id))
11251 r = snprintf(val, size, "%s", o.get_pool_name(
11252 in->layout.pool_id).c_str());
11253 else
11254 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11255 });
11256 return r;
11257}
11258size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11259{
11260 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11261}
11262size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11263{
11264 return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11265}
11266size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11267{
11268 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
11269}
11270size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11271{
11272 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
11273}
11274size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11275{
11276 return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11277}
11278size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11279{
11280 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
11281}
11282size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11283{
11284 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
11285}
11286size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11287{
11288 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
11289}
11290size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11291{
11292 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11293 (long)in->rstat.rctime.nsec());
11294}
11295
// Build a public vxattr name, e.g. CEPH_XATTR_NAME(dir, entries)
// -> "ceph.dir.entries".
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
// Two-component variant, e.g. "ceph.file.layout.pool".
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Table entry for a read-only, always-listed stat vxattr backed by
// Client::_vxattrcb_<type>_<name>.
#define XATTR_NAME_CEPH(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  hidden: false, \
  exists_cb: NULL, \
}
// Table entry for one layout field (e.g. "ceph.file.layout.pool"):
// writable, hidden from listxattr, present only when a layout is set.
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
  name: CEPH_XATTR_NAME2(_type, _name, _field), \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_layout_exists, \
}
// Table entry for one quota field: writable, hidden from listxattr,
// present only while a quota is enabled on the inode.
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_quota_exists, \
}
11323
// Virtual xattrs exposed on directories. Hidden entries are readable
// but omitted from listxattr; an exists_cb (when set) gates visibility.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  XATTR_NAME_CEPH(dir, rentries),
  XATTR_NAME_CEPH(dir, rfiles),
  XATTR_NAME_CEPH(dir, rsubdirs),
  XATTR_NAME_CEPH(dir, rbytes),
  XATTR_NAME_CEPH(dir, rctime),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  { name: "" }     /* Required table terminator */
};
11356
// Virtual xattrs exposed on regular files: just the layout and its
// per-field views. All are hidden from listxattr and exist only once
// a non-default layout is set.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" }     /* Required table terminator */
};
11372
11373const Client::VXattr *Client::_get_vxattrs(Inode *in)
11374{
11375 if (in->is_dir())
11376 return _dir_vxattrs;
11377 else if (in->is_file())
11378 return _file_vxattrs;
11379 return NULL;
11380}
11381
11382const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11383{
11384 if (strncmp(name, "ceph.", 5) == 0) {
11385 const VXattr *vxattr = _get_vxattrs(in);
11386 if (vxattr) {
11387 while (!vxattr->name.empty()) {
11388 if (vxattr->name == name)
11389 return vxattr;
11390 vxattr++;
11391 }
11392 }
11393 }
11394 return NULL;
11395}
11396
11397size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11398{
11399 size_t len = 0;
11400 while (!vxattr->name.empty()) {
11401 if (!vxattr->hidden)
11402 len += vxattr->name.length() + 1;
11403 vxattr++;
11404 }
11405 return len;
11406}
11407
11408int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11409{
11410 Mutex::Locker lock(client_lock);
11411
181888fb
FG
11412 if (unmounting)
11413 return -ENOTCONN;
11414
7c673cae
FG
11415 vinodeno_t vino = _get_vino(in);
11416
11417 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11418 tout(cct) << "ll_readlink" << std::endl;
11419 tout(cct) << vino.ino.val << std::endl;
11420
11421 set<Dentry*>::iterator dn = in->dn_set.begin();
11422 while (dn != in->dn_set.end()) {
11423 touch_dn(*dn);
11424 ++dn;
11425 }
11426
11427 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11428 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11429 return r;
11430}
11431
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  // Create a filesystem node (device, fifo, socket, ...) under 'dir'
  // via an MDS MKNOD request. On success *inp refers to the new inode.
  ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;  // snapshots are read-only
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // drop cached dentry leases unless the MDS grants exclusive caps
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also adjust 'mode'.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // error before the request was sent: drop our reference to it
  put_request(req);
  return res;
}
11485
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  // Low-level mknod entry point: on success fills *attr, takes an ll
  // reference on the new inode and returns it through *out.
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  // Unless fuse does permission checks itself, verify we may create
  // entries in the parent.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  // NOTE(review): on failure *attr is traced below without having been
  // filled and *out becomes NULL; callers must check r first.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11522
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  // statx-flavored variant of ll_mknod: fills *stx with the fields
  // selected by 'want'/'flags' and returns the new inode through *out.
  unsigned caps = statx_to_mask(flags, want);
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  // Unless fuse does permission checks itself, verify we may create
  // entries in the parent.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());
  }
  // NOTE(review): on failure *stx is traced below without having been
  // filled and *out becomes NULL; callers must check r first.
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11561
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  // Create (and optionally open) a regular file under 'dir' via an MDS
  // CREATE request. On success *inp refers to the new inode; when fhp
  // is non-NULL a file handle is returned in *fhp; *created reports
  // whether the file was newly created by the MDS.
  ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;  // snapshots are read-only
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  // Resolve an explicitly requested data pool name to its id.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    // the open wire field only holds 32 bits of pool id
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  // drop cached dentry leases unless the MDS grants exclusive caps
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // Inherit default ACLs from the parent; this may also adjust 'mode'.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // error before the request was sent: drop our reference to it
  put_request(req);
  return res;
}
11654
11655
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  // Create a directory under 'dir' — or a snapshot, when 'dir' is the
  // snapdir (MKSNAP). On success *inp refers to the new inode.
  ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Allowed on the live fs and on the snapdir itself (snap creation).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // drop cached dentry leases unless the MDS grants exclusive caps
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // Inherit default ACLs from the parent; this may also adjust 'mode'.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // error before the request was sent: drop our reference to it
  put_request(req);
  return res;
}
11711
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  // Low-level mkdir entry point: on success fills *attr, takes an ll
  // reference on the new inode and returns it through *out.
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  // Unless fuse does permission checks itself, verify we may create
  // entries in the parent.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  // NOTE(review): on failure *attr is traced below without having been
  // filled and *out becomes NULL; callers must check r first.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11746
11747int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
11748 struct ceph_statx *stx, unsigned want, unsigned flags,
11749 const UserPerm& perms)
11750{
11751 Mutex::Locker lock(client_lock);
11752
181888fb
FG
11753 if (unmounting)
11754 return -ENOTCONN;
11755
7c673cae
FG
11756 vinodeno_t vparent = _get_vino(parent);
11757
11758 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
11759 tout(cct) << "ll_mkdirx" << std::endl;
11760 tout(cct) << vparent.ino.val << std::endl;
11761 tout(cct) << name << std::endl;
11762 tout(cct) << mode << std::endl;
11763
11764 if (!cct->_conf->fuse_default_permissions) {
11765 int r = may_create(parent, perms);
11766 if (r < 0)
11767 return r;
11768 }
11769
11770 InodeRef in;
11771 int r = _mkdir(parent, name, mode, perms, &in);
11772 if (r == 0) {
11773 fill_statx(in, statx_to_mask(flags, want), stx);
11774 _ll_get(in.get());
11775 } else {
11776 stx->stx_ino = 0;
11777 stx->stx_mask = 0;
11778 }
11779 tout(cct) << stx->stx_ino << std::endl;
11780 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
11781 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11782 *out = in.get();
11783 return r;
11784}
11785
11786int Client::_symlink(Inode *dir, const char *name, const char *target,
11787 const UserPerm& perms, InodeRef *inp)
11788{
11789 ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
11790 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
11791 << dendl;
11792
11793 if (strlen(name) > NAME_MAX)
11794 return -ENAMETOOLONG;
11795
11796 if (dir->snapid != CEPH_NOSNAP) {
11797 return -EROFS;
11798 }
11799 if (is_quota_files_exceeded(dir, perms)) {
11800 return -EDQUOT;
11801 }
11802
11803 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
11804
11805 filepath path;
11806 dir->make_nosnap_relative_path(path);
11807 path.push_dentry(name);
11808 req->set_filepath(path);
11809 req->set_inode(dir);
11810 req->set_string2(target);
11811 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11812 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11813
11814 Dentry *de;
11815 int res = get_or_create(dir, name, &de);
11816 if (res < 0)
11817 goto fail;
11818 req->set_dentry(de);
11819
11820 res = make_request(req, perms, inp);
11821
11822 trim_cache();
11823 ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
11824 res << dendl;
11825 return res;
11826
11827 fail:
11828 put_request(req);
11829 return res;
11830}
11831
11832int Client::ll_symlink(Inode *parent, const char *name, const char *value,
11833 struct stat *attr, Inode **out, const UserPerm& perms)
11834{
11835 Mutex::Locker lock(client_lock);
11836
181888fb
FG
11837 if (unmounting)
11838 return -ENOTCONN;
11839
7c673cae
FG
11840 vinodeno_t vparent = _get_vino(parent);
11841
11842 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
11843 << dendl;
11844 tout(cct) << "ll_symlink" << std::endl;
11845 tout(cct) << vparent.ino.val << std::endl;
11846 tout(cct) << name << std::endl;
11847 tout(cct) << value << std::endl;
11848
11849 if (!cct->_conf->fuse_default_permissions) {
11850 int r = may_create(parent, perms);
11851 if (r < 0)
11852 return r;
11853 }
11854
11855 InodeRef in;
11856 int r = _symlink(parent, name, value, perms, &in);
11857 if (r == 0) {
11858 fill_stat(in, attr);
11859 _ll_get(in.get());
11860 }
11861 tout(cct) << attr->st_ino << std::endl;
11862 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
11863 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11864 *out = in.get();
11865 return r;
11866}
11867
11868int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
11869 Inode **out, struct ceph_statx *stx, unsigned want,
11870 unsigned flags, const UserPerm& perms)
11871{
11872 Mutex::Locker lock(client_lock);
11873
181888fb
FG
11874 if (unmounting)
11875 return -ENOTCONN;
11876
7c673cae
FG
11877 vinodeno_t vparent = _get_vino(parent);
11878
11879 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
11880 << dendl;
11881 tout(cct) << "ll_symlinkx" << std::endl;
11882 tout(cct) << vparent.ino.val << std::endl;
11883 tout(cct) << name << std::endl;
11884 tout(cct) << value << std::endl;
11885
11886 if (!cct->_conf->fuse_default_permissions) {
11887 int r = may_create(parent, perms);
11888 if (r < 0)
11889 return r;
11890 }
11891
11892 InodeRef in;
11893 int r = _symlink(parent, name, value, perms, &in);
11894 if (r == 0) {
11895 fill_statx(in, statx_to_mask(flags, want), stx);
11896 _ll_get(in.get());
11897 }
11898 tout(cct) << stx->stx_ino << std::endl;
11899 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
11900 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11901 *out = in.get();
11902 return r;
11903}
11904
11905int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
11906{
11907 ldout(cct, 3) << "_unlink(" << dir->ino << " " << name
11908 << " uid " << perm.uid() << " gid " << perm.gid()
11909 << ")" << dendl;
11910
11911 if (dir->snapid != CEPH_NOSNAP) {
11912 return -EROFS;
11913 }
11914
11915 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
11916
11917 filepath path;
11918 dir->make_nosnap_relative_path(path);
11919 path.push_dentry(name);
11920 req->set_filepath(path);
11921
11922 InodeRef otherin;
11923
11924 Dentry *de;
11925 int res = get_or_create(dir, name, &de);
11926 if (res < 0)
11927 goto fail;
11928 req->set_dentry(de);
11929 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11930 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11931
11932 res = _lookup(dir, name, 0, &otherin, perm);
11933 if (res < 0)
11934 goto fail;
11935 req->set_other_inode(otherin.get());
11936 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
11937
11938 req->set_inode(dir);
11939
11940 res = make_request(req, perm);
11941
11942 trim_cache();
11943 ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl;
11944 return res;
11945
11946 fail:
11947 put_request(req);
11948 return res;
11949}
11950
11951int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
11952{
11953 Mutex::Locker lock(client_lock);
11954
181888fb
FG
11955 if (unmounting)
11956 return -ENOTCONN;
11957
7c673cae
FG
11958 vinodeno_t vino = _get_vino(in);
11959
11960 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
11961 tout(cct) << "ll_unlink" << std::endl;
11962 tout(cct) << vino.ino.val << std::endl;
11963 tout(cct) << name << std::endl;
11964
11965 if (!cct->_conf->fuse_default_permissions) {
11966 int r = may_delete(in, name, perm);
11967 if (r < 0)
11968 return r;
11969 }
11970 return _unlink(in, name, perm);
11971}
11972
11973int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
11974{
11975 ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid "
11976 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
11977
11978 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
11979 return -EROFS;
11980 }
11981
11982 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP:CEPH_MDS_OP_RMDIR);
11983 filepath path;
11984 dir->make_nosnap_relative_path(path);
11985 path.push_dentry(name);
11986 req->set_filepath(path);
11987
11988 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11989 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11990 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
11991
11992 InodeRef in;
11993
11994 Dentry *de;
11995 int res = get_or_create(dir, name, &de);
11996 if (res < 0)
11997 goto fail;
11998 res = _lookup(dir, name, 0, &in, perms);
11999 if (res < 0)
12000 goto fail;
12001 if (req->get_op() == CEPH_MDS_OP_RMDIR) {
12002 req->set_inode(dir);
12003 req->set_dentry(de);
12004 req->set_other_inode(in.get());
12005 } else {
12006 unlink(de, true, true);
12007 req->set_other_inode(in.get());
12008 }
12009
12010 res = make_request(req, perms);
12011
12012 trim_cache();
12013 ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl;
12014 return res;
12015
12016 fail:
12017 put_request(req);
12018 return res;
12019}
12020
12021int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12022{
12023 Mutex::Locker lock(client_lock);
12024
181888fb
FG
12025 if (unmounting)
12026 return -ENOTCONN;
12027
7c673cae
FG
12028 vinodeno_t vino = _get_vino(in);
12029
12030 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12031 tout(cct) << "ll_rmdir" << std::endl;
12032 tout(cct) << vino.ino.val << std::endl;
12033 tout(cct) << name << std::endl;
12034
12035 if (!cct->_conf->fuse_default_permissions) {
12036 int r = may_delete(in, name, perms);
12037 if (r < 0)
12038 return r;
12039 }
12040
12041 return _rmdir(in, name, perms);
12042}
12043
12044int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12045{
12046 ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to "
12047 << todir->ino << " " << toname
12048 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12049 << dendl;
12050
12051 if (fromdir->snapid != todir->snapid)
12052 return -EXDEV;
12053
12054 int op = CEPH_MDS_OP_RENAME;
12055 if (fromdir->snapid != CEPH_NOSNAP) {
12056 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12057 op = CEPH_MDS_OP_RENAMESNAP;
12058 else
12059 return -EROFS;
12060 }
12061 if (fromdir != todir) {
12062 Inode *fromdir_root =
12063 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12064 Inode *todir_root =
12065 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12066 if (fromdir_root != todir_root) {
12067 return -EXDEV;
12068 }
12069 }
12070
12071 InodeRef target;
12072 MetaRequest *req = new MetaRequest(op);
12073
12074 filepath from;
12075 fromdir->make_nosnap_relative_path(from);
12076 from.push_dentry(fromname);
12077 filepath to;
12078 todir->make_nosnap_relative_path(to);
12079 to.push_dentry(toname);
12080 req->set_filepath(to);
12081 req->set_filepath2(from);
12082
12083 Dentry *oldde;
12084 int res = get_or_create(fromdir, fromname, &oldde);
12085 if (res < 0)
12086 goto fail;
12087 Dentry *de;
12088 res = get_or_create(todir, toname, &de);
12089 if (res < 0)
12090 goto fail;
12091
12092 if (op == CEPH_MDS_OP_RENAME) {
12093 req->set_old_dentry(oldde);
12094 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12095 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12096
12097 req->set_dentry(de);
12098 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12099 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12100
12101 InodeRef oldin, otherin;
12102 res = _lookup(fromdir, fromname, 0, &oldin, perm);
12103 if (res < 0)
12104 goto fail;
12105 req->set_old_inode(oldin.get());
12106 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12107
12108 res = _lookup(todir, toname, 0, &otherin, perm);
12109 if (res != 0 && res != -ENOENT) {
12110 goto fail;
12111 } else if (res == 0) {
12112 req->set_other_inode(otherin.get());
12113 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12114 }
12115
12116 req->set_inode(todir);
12117 } else {
12118 // renamesnap reply contains no tracedn, so we need to invalidate
12119 // dentry manually
12120 unlink(oldde, true, true);
12121 unlink(de, true, true);
12122 }
12123
12124 res = make_request(req, perm, &target);
12125 ldout(cct, 10) << "rename result is " << res << dendl;
12126
12127 // renamed item from our cache
12128
12129 trim_cache();
12130 ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl;
12131 return res;
12132
12133 fail:
12134 put_request(req);
12135 return res;
12136}
12137
12138int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12139 const char *newname, const UserPerm& perm)
12140{
12141 Mutex::Locker lock(client_lock);
12142
181888fb
FG
12143 if (unmounting)
12144 return -ENOTCONN;
12145
7c673cae
FG
12146 vinodeno_t vparent = _get_vino(parent);
12147 vinodeno_t vnewparent = _get_vino(newparent);
12148
12149 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12150 << vnewparent << " " << newname << dendl;
12151 tout(cct) << "ll_rename" << std::endl;
12152 tout(cct) << vparent.ino.val << std::endl;
12153 tout(cct) << name << std::endl;
12154 tout(cct) << vnewparent.ino.val << std::endl;
12155 tout(cct) << newname << std::endl;
12156
12157 if (!cct->_conf->fuse_default_permissions) {
12158 int r = may_delete(parent, name, perm);
12159 if (r < 0)
12160 return r;
12161 r = may_delete(newparent, newname, perm);
12162 if (r < 0 && r != -ENOENT)
12163 return r;
12164 }
12165
12166 return _rename(parent, name, newparent, newname, perm);
12167}
12168
12169int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12170{
12171 ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
12172 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12173
12174 if (strlen(newname) > NAME_MAX)
12175 return -ENAMETOOLONG;
12176
12177 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12178 return -EROFS;
12179 }
12180 if (is_quota_files_exceeded(dir, perm)) {
12181 return -EDQUOT;
12182 }
12183
12184 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12185
12186 filepath path(newname, dir->ino);
12187 req->set_filepath(path);
12188 filepath existing(in->ino);
12189 req->set_filepath2(existing);
12190
12191 req->set_inode(dir);
12192 req->inode_drop = CEPH_CAP_FILE_SHARED;
12193 req->inode_unless = CEPH_CAP_FILE_EXCL;
12194
12195 Dentry *de;
12196 int res = get_or_create(dir, newname, &de);
12197 if (res < 0)
12198 goto fail;
12199 req->set_dentry(de);
12200
12201 res = make_request(req, perm, inp);
12202 ldout(cct, 10) << "link result is " << res << dendl;
12203
12204 trim_cache();
12205 ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl;
12206 return res;
12207
12208 fail:
12209 put_request(req);
12210 return res;
12211}
12212
12213int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12214 const UserPerm& perm)
12215{
12216 Mutex::Locker lock(client_lock);
12217
181888fb
FG
12218 if (unmounting)
12219 return -ENOTCONN;
12220
7c673cae
FG
12221 vinodeno_t vino = _get_vino(in);
12222 vinodeno_t vnewparent = _get_vino(newparent);
12223
31f18b77 12224 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
12225 newname << dendl;
12226 tout(cct) << "ll_link" << std::endl;
12227 tout(cct) << vino.ino.val << std::endl;
12228 tout(cct) << vnewparent << std::endl;
12229 tout(cct) << newname << std::endl;
12230
12231 int r = 0;
12232 InodeRef target;
12233
12234 if (!cct->_conf->fuse_default_permissions) {
12235 if (S_ISDIR(in->mode))
12236 return -EPERM;
12237
12238 r = may_hardlink(in, perm);
12239 if (r < 0)
12240 return r;
12241
12242 r = may_create(newparent, perm);
12243 if (r < 0)
12244 return r;
12245 }
12246
12247 return _link(in, newparent, newname, perm, &target);
12248}
12249
12250int Client::ll_num_osds(void)
12251{
12252 Mutex::Locker lock(client_lock);
12253 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12254}
12255
12256int Client::ll_osdaddr(int osd, uint32_t *addr)
12257{
12258 Mutex::Locker lock(client_lock);
181888fb 12259
7c673cae
FG
12260 entity_addr_t g;
12261 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12262 if (!o.exists(osd))
12263 return false;
12264 g = o.get_addr(osd);
12265 return true;
12266 });
12267 if (!exists)
12268 return -1;
12269 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12270 *addr = ntohl(nb_addr);
12271 return 0;
12272}
181888fb 12273
7c673cae
FG
12274uint32_t Client::ll_stripe_unit(Inode *in)
12275{
12276 Mutex::Locker lock(client_lock);
12277 return in->layout.stripe_unit;
12278}
12279
12280uint64_t Client::ll_snap_seq(Inode *in)
12281{
12282 Mutex::Locker lock(client_lock);
12283 return in->snaprealm->seq;
12284}
12285
12286int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12287{
12288 Mutex::Locker lock(client_lock);
12289 *layout = in->layout;
12290 return 0;
12291}
12292
12293int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12294{
12295 return ll_file_layout(fh->inode.get(), layout);
12296}
12297
12298/* Currently we cannot take advantage of redundancy in reads, since we
12299 would have to go through all possible placement groups (a
12300 potentially quite large number determined by a hash), and use CRUSH
12301 to calculate the appropriate set of OSDs for each placement group,
12302 then index into that. An array with one entry per OSD is much more
12303 tractable and works for demonstration purposes. */
12304
12305int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12306 file_layout_t* layout)
12307{
12308 Mutex::Locker lock(client_lock);
181888fb 12309
7c673cae
FG
12310 inodeno_t ino = ll_get_inodeno(in);
12311 uint32_t object_size = layout->object_size;
12312 uint32_t su = layout->stripe_unit;
12313 uint32_t stripe_count = layout->stripe_count;
12314 uint64_t stripes_per_object = object_size / su;
12315
12316 uint64_t stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12317 uint64_t stripepos = blockno % stripe_count; // which object in the object set (X)
12318 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12319 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12320
12321 object_t oid = file_object_t(ino, objectno);
12322 return objecter->with_osdmap([&](const OSDMap& o) {
12323 ceph_object_layout olayout =
12324 o.file_to_object_layout(oid, *layout);
12325 pg_t pg = (pg_t)olayout.ol_pgid;
12326 vector<int> osds;
12327 int primary;
12328 o.pg_to_acting_osds(pg, &osds, &primary);
12329 return primary;
12330 });
12331}
12332
12333/* Return the offset of the block, internal to the object */
12334
12335uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12336{
12337 Mutex::Locker lock(client_lock);
12338 file_layout_t *layout=&(in->layout);
12339 uint32_t object_size = layout->object_size;
12340 uint32_t su = layout->stripe_unit;
12341 uint64_t stripes_per_object = object_size / su;
12342
12343 return (blockno % stripes_per_object) * su;
12344}
12345
12346int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12347 const UserPerm& perms)
12348{
12349 Mutex::Locker lock(client_lock);
12350
181888fb
FG
12351 if (unmounting)
12352 return -ENOTCONN;
12353
7c673cae
FG
12354 vinodeno_t vino = _get_vino(in);
12355
12356 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12357 tout(cct) << "ll_opendir" << std::endl;
12358 tout(cct) << vino.ino.val << std::endl;
12359
12360 if (!cct->_conf->fuse_default_permissions) {
12361 int r = may_open(in, flags, perms);
12362 if (r < 0)
12363 return r;
12364 }
12365
12366 int r = _opendir(in, dirpp, perms);
12367 tout(cct) << (unsigned long)*dirpp << std::endl;
12368
12369 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12370 << dendl;
12371 return r;
12372}
12373
12374int Client::ll_releasedir(dir_result_t *dirp)
12375{
12376 Mutex::Locker lock(client_lock);
12377 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12378 tout(cct) << "ll_releasedir" << std::endl;
12379 tout(cct) << (unsigned long)dirp << std::endl;
181888fb
FG
12380
12381 if (unmounting)
12382 return -ENOTCONN;
12383
7c673cae
FG
12384 _closedir(dirp);
12385 return 0;
12386}
12387
12388int Client::ll_fsyncdir(dir_result_t *dirp)
12389{
12390 Mutex::Locker lock(client_lock);
12391 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12392 tout(cct) << "ll_fsyncdir" << std::endl;
12393 tout(cct) << (unsigned long)dirp << std::endl;
12394
181888fb
FG
12395 if (unmounting)
12396 return -ENOTCONN;
12397
7c673cae
FG
12398 return _fsync(dirp->inode.get(), false);
12399}
12400
12401int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12402{
12403 assert(!(flags & O_CREAT));
12404
12405 Mutex::Locker lock(client_lock);
12406
181888fb
FG
12407 if (unmounting)
12408 return -ENOTCONN;
12409
7c673cae
FG
12410 vinodeno_t vino = _get_vino(in);
12411
12412 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
12413 tout(cct) << "ll_open" << std::endl;
12414 tout(cct) << vino.ino.val << std::endl;
12415 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12416
12417 int r;
12418 if (!cct->_conf->fuse_default_permissions) {
12419 r = may_open(in, flags, perms);
12420 if (r < 0)
12421 goto out;
12422 }
12423
12424 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
12425
12426 out:
12427 Fh *fhptr = fhp ? *fhp : NULL;
12428 if (fhptr) {
12429 ll_unclosed_fh_set.insert(fhptr);
12430 }
12431 tout(cct) << (unsigned long)fhptr << std::endl;
12432 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
12433 " = " << r << " (" << fhptr << ")" << dendl;
12434 return r;
12435}
12436
12437int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
12438 int flags, InodeRef *in, int caps, Fh **fhp,
12439 const UserPerm& perms)
12440{
12441 *fhp = NULL;
12442
12443 vinodeno_t vparent = _get_vino(parent);
12444
12445 ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
12446 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
12447 << ", gid " << perms.gid() << dendl;
12448 tout(cct) << "ll_create" << std::endl;
12449 tout(cct) << vparent.ino.val << std::endl;
12450 tout(cct) << name << std::endl;
12451 tout(cct) << mode << std::endl;
12452 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12453
12454 bool created = false;
12455 int r = _lookup(parent, name, caps, in, perms);
12456
12457 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
12458 return -EEXIST;
12459
12460 if (r == -ENOENT && (flags & O_CREAT)) {
12461 if (!cct->_conf->fuse_default_permissions) {
12462 r = may_create(parent, perms);
12463 if (r < 0)
12464 goto out;
12465 }
12466 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
12467 perms);
12468 if (r < 0)
12469 goto out;
12470 }
12471
12472 if (r < 0)
12473 goto out;
12474
12475 assert(*in);
12476
12477 ldout(cct, 20) << "_ll_create created = " << created << dendl;
12478 if (!created) {
12479 if (!cct->_conf->fuse_default_permissions) {
12480 r = may_open(in->get(), flags, perms);
12481 if (r < 0) {
12482 if (*fhp) {
12483 int release_r = _release_fh(*fhp);
12484 assert(release_r == 0); // during create, no async data ops should have happened
12485 }
12486 goto out;
12487 }
12488 }
12489 if (*fhp == NULL) {
12490 r = _open(in->get(), flags, mode, fhp, perms);
12491 if (r < 0)
12492 goto out;
12493 }
12494 }
12495
12496out:
12497 if (*fhp) {
12498 ll_unclosed_fh_set.insert(*fhp);
12499 }
12500
12501 ino_t ino = 0;
12502 if (r >= 0) {
12503 Inode *inode = in->get();
12504 if (use_faked_inos())
12505 ino = inode->faked_ino;
12506 else
12507 ino = inode->ino;
12508 }
12509
12510 tout(cct) << (unsigned long)*fhp << std::endl;
12511 tout(cct) << ino << std::endl;
31f18b77 12512 ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
12513 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
12514 *fhp << " " << hex << ino << dec << ")" << dendl;
12515
12516 return r;
12517}
12518
12519int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12520 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12521 const UserPerm& perms)
12522{
12523 Mutex::Locker lock(client_lock);
12524 InodeRef in;
12525
181888fb
FG
12526 if (unmounting)
12527 return -ENOTCONN;
12528
7c673cae
FG
12529 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12530 fhp, perms);
12531 if (r >= 0) {
12532 assert(in);
12533
12534 // passing an Inode in outp requires an additional ref
12535 if (outp) {
12536 _ll_get(in.get());
12537 *outp = in.get();
12538 }
12539 fill_stat(in, attr);
12540 } else {
12541 attr->st_ino = 0;
12542 }
12543
12544 return r;
12545}
12546
12547int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12548 int oflags, Inode **outp, Fh **fhp,
12549 struct ceph_statx *stx, unsigned want, unsigned lflags,
12550 const UserPerm& perms)
12551{
12552 unsigned caps = statx_to_mask(lflags, want);
12553 Mutex::Locker lock(client_lock);
12554 InodeRef in;
12555
181888fb
FG
12556 if (unmounting)
12557 return -ENOTCONN;
7c673cae
FG
12558
12559 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12560 if (r >= 0) {
12561 assert(in);
12562
12563 // passing an Inode in outp requires an additional ref
12564 if (outp) {
12565 _ll_get(in.get());
12566 *outp = in.get();
12567 }
12568 fill_statx(in, caps, stx);
12569 } else {
12570 stx->stx_ino = 0;
12571 stx->stx_mask = 0;
12572 }
12573
12574 return r;
12575}
12576
12577loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12578{
12579 Mutex::Locker lock(client_lock);
12580 tout(cct) << "ll_lseek" << std::endl;
12581 tout(cct) << offset << std::endl;
12582 tout(cct) << whence << std::endl;
12583
181888fb
FG
12584 if (unmounting)
12585 return -ENOTCONN;
12586
7c673cae
FG
12587 return _lseek(fh, offset, whence);
12588}
12589
12590int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12591{
12592 Mutex::Locker lock(client_lock);
12593 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12594 tout(cct) << "ll_read" << std::endl;
12595 tout(cct) << (unsigned long)fh << std::endl;
12596 tout(cct) << off << std::endl;
12597 tout(cct) << len << std::endl;
12598
181888fb
FG
12599 if (unmounting)
12600 return -ENOTCONN;
12601
7c673cae
FG
12602 return _read(fh, off, len, bl);
12603}
12604
12605int Client::ll_read_block(Inode *in, uint64_t blockid,
12606 char *buf,
12607 uint64_t offset,
12608 uint64_t length,
12609 file_layout_t* layout)
12610{
12611 Mutex::Locker lock(client_lock);
181888fb
FG
12612
12613 if (unmounting)
12614 return -ENOTCONN;
12615
7c673cae
FG
12616 vinodeno_t vino = ll_get_vino(in);
12617 object_t oid = file_object_t(vino.ino, blockid);
12618 C_SaferCond onfinish;
12619 bufferlist bl;
12620
12621 objecter->read(oid,
12622 object_locator_t(layout->pool_id),
12623 offset,
12624 length,
12625 vino.snapid,
12626 &bl,
12627 CEPH_OSD_FLAG_READ,
12628 &onfinish);
12629
12630 client_lock.Unlock();
12631 int r = onfinish.wait();
12632 client_lock.Lock();
12633
12634 if (r >= 0) {
12635 bl.copy(0, bl.length(), buf);
12636 r = bl.length();
12637 }
12638
12639 return r;
12640}
12641
12642/* It appears that the OSD doesn't return success unless the entire
12643 buffer was written, return the write length on success. */
12644
12645int Client::ll_write_block(Inode *in, uint64_t blockid,
12646 char* buf, uint64_t offset,
12647 uint64_t length, file_layout_t* layout,
12648 uint64_t snapseq, uint32_t sync)
12649{
12650 Mutex flock("Client::ll_write_block flock");
12651 vinodeno_t vino = ll_get_vino(in);
12652 Cond cond;
12653 bool done;
12654 int r = 0;
181888fb 12655 Context *onsafe = nullptr;
7c673cae
FG
12656
12657 if (length == 0) {
12658 return -EINVAL;
12659 }
12660 if (true || sync) {
12661 /* if write is stable, the epilogue is waiting on
12662 * flock */
12663 onsafe = new C_SafeCond(&flock, &cond, &done, &r);
12664 done = false;
12665 } else {
12666 /* if write is unstable, we just place a barrier for
12667 * future commits to wait on */
12668 /*onsafe = new C_Block_Sync(this, vino.ino,
12669 barrier_interval(offset, offset + length), &r);
12670 */
12671 done = true;
12672 }
12673 object_t oid = file_object_t(vino.ino, blockid);
12674 SnapContext fakesnap;
12675 bufferptr bp;
12676 if (length > 0) bp = buffer::copy(buf, length);
12677 bufferlist bl;
12678 bl.push_back(bp);
12679
12680 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
12681 << dendl;
12682
12683 fakesnap.seq = snapseq;
12684
12685 /* lock just in time */
12686 client_lock.Lock();
181888fb
FG
12687 if (unmounting) {
12688 client_lock.Unlock();
12689 delete onsafe;
12690 return -ENOTCONN;
12691 }
7c673cae
FG
12692
12693 objecter->write(oid,
12694 object_locator_t(layout->pool_id),
12695 offset,
12696 length,
12697 fakesnap,
12698 bl,
12699 ceph::real_clock::now(),
12700 0,
12701 onsafe);
12702
12703 client_lock.Unlock();
12704 if (!done /* also !sync */) {
12705 flock.Lock();
12706 while (! done)
12707 cond.Wait(flock);
12708 flock.Unlock();
12709 }
12710
12711 if (r < 0) {
12712 return r;
12713 } else {
12714 return length;
12715 }
12716}
12717
12718int Client::ll_commit_blocks(Inode *in,
12719 uint64_t offset,
12720 uint64_t length)
12721{
12722 Mutex::Locker lock(client_lock);
12723 /*
12724 BarrierContext *bctx;
12725 vinodeno_t vino = ll_get_vino(in);
12726 uint64_t ino = vino.ino;
12727
12728 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
12729 << offset << " to " << length << dendl;
12730
12731 if (length == 0) {
12732 return -EINVAL;
12733 }
12734
12735 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
12736 if (p != barriers.end()) {
12737 barrier_interval civ(offset, offset + length);
12738 p->second->commit_barrier(civ);
12739 }
12740 */
12741 return 0;
12742}
12743
12744int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12745{
12746 Mutex::Locker lock(client_lock);
12747 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12748 "~" << len << dendl;
12749 tout(cct) << "ll_write" << std::endl;
12750 tout(cct) << (unsigned long)fh << std::endl;
12751 tout(cct) << off << std::endl;
12752 tout(cct) << len << std::endl;
12753
181888fb
FG
12754 if (unmounting)
12755 return -ENOTCONN;
12756
7c673cae
FG
12757 int r = _write(fh, off, len, data, NULL, 0);
12758 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
12759 << dendl;
12760 return r;
12761}
12762
12763int Client::ll_flush(Fh *fh)
12764{
12765 Mutex::Locker lock(client_lock);
12766 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
12767 tout(cct) << "ll_flush" << std::endl;
12768 tout(cct) << (unsigned long)fh << std::endl;
12769
181888fb
FG
12770 if (unmounting)
12771 return -ENOTCONN;
12772
7c673cae
FG
12773 return _flush(fh);
12774}
12775
12776int Client::ll_fsync(Fh *fh, bool syncdataonly)
12777{
12778 Mutex::Locker lock(client_lock);
12779 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
12780 tout(cct) << "ll_fsync" << std::endl;
12781 tout(cct) << (unsigned long)fh << std::endl;
12782
181888fb
FG
12783 if (unmounting)
12784 return -ENOTCONN;
12785
7c673cae
FG
12786 int r = _fsync(fh, syncdataonly);
12787 if (r) {
12788 // If we're returning an error, clear it from the FH
12789 fh->take_async_err();
12790 }
12791 return r;
12792}
12793
12794#ifdef FALLOC_FL_PUNCH_HOLE
12795
// _fallocate: core implementation behind fallocate()/ll_fallocate().
// Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted, and
// punching a hole additionally requires KEEP_SIZE. Caller holds client_lock
// (temporarily dropped while waiting for OSD ops below).
// Returns 0 or a negative errno.
12796int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
12797{
12798  if (offset < 0 || length <= 0)
12799    return -EINVAL;
12800
12801  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
12802    return -EOPNOTSUPP;
12803
12804  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
12805    return -EOPNOTSUPP;
12806
12807  Inode *in = fh->inode.get();
12808
  // Allocation against a full pool is refused; punching a hole frees space,
  // so it is still allowed.
12809  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
12810      !(mode & FALLOC_FL_PUNCH_HOLE)) {
12811    return -ENOSPC;
12812  }
12813
12814  if (in->snapid != CEPH_NOSNAP)
12815    return -EROFS;
12816
12817  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
12818    return -EBADF;
12819
  // Quota pre-check only when this call can actually grow the file.
12820  uint64_t size = offset + length;
12821  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
12822      size > in->size &&
12823      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
12824    return -EDQUOT;
12825  }
12826
12827  int have;
12828  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
12829  if (r < 0)
12830    return r;
12831
  // Completion plumbing for converting inline data to normal objects.
12832  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
12833  Cond uninline_cond;
12834  bool uninline_done = false;
12835  int uninline_ret = 0;
12836  Context *onuninline = NULL;
12837
12838  if (mode & FALLOC_FL_PUNCH_HOLE) {
    // Small inline files with buffer caps can be edited in memory: rebuild
    // the inline blob with the punched range zero-filled.
12839    if (in->inline_version < CEPH_INLINE_NONE &&
12840        (have & CEPH_CAP_FILE_BUFFER)) {
12841      bufferlist bl;
12842      int len = in->inline_data.length();
12843      if (offset < len) {
12844        if (offset > 0)
12845          in->inline_data.copy(0, offset, bl);
12846        int size = length;
12847        if (offset + size > len)
12848          size = len - offset;
12849        if (size > 0)
12850          bl.append_zero(size);
12851        if (offset + size < len)
12852          in->inline_data.copy(offset + size, len - offset - size, bl);
12853        in->inline_data = bl;
12854        in->inline_version++;
12855      }
12856      in->mtime = ceph_clock_now();
12857      in->change_attr++;
12858      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
12859    } else {
      // Otherwise: first migrate any inline data out, then zero the range
      // on the OSDs with a synchronous filer->zero().
12860      if (in->inline_version < CEPH_INLINE_NONE) {
12861        onuninline = new C_SafeCond(&uninline_flock,
12862                                    &uninline_cond,
12863                                    &uninline_done,
12864                                    &uninline_ret);
12865        uninline_data(in, onuninline);
12866      }
12867
12868      Mutex flock("Client::_punch_hole flock");
12869      Cond cond;
12870      bool done = false;
12871      Context *onfinish = new C_SafeCond(&flock, &cond, &done);
12872
12873      unsafe_sync_write++;
12874      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
12875
12876      _invalidate_inode_cache(in, offset, length);
12877      filer->zero(in->ino, &in->layout,
12878                  in->snaprealm->get_snap_context(),
12879                  offset, length,
12880                  ceph::real_clock::now(),
12881                  0, true, onfinish);
12882      in->mtime = ceph_clock_now();
12883      in->change_attr++;
12884      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
12885
      // Drop client_lock while blocking on the OSD zero op; the standard
      // wait pattern for C_SafeCond.
12886      client_lock.Unlock();
12887      flock.Lock();
12888      while (!done)
12889        cond.Wait(flock);
12890      flock.Unlock();
12891      client_lock.Lock();
12892      _sync_write_commit(in);
12893    }
12894  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocate past EOF: just extend the local size and dirty caps;
    // no object data needs to be written.
12895    uint64_t size = offset + length;
12896    if (size > in->size) {
12897      in->size = size;
12898      in->mtime = ceph_clock_now();
12899      in->change_attr++;
12900      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
12901
12902      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
12903        check_caps(in, CHECK_CAPS_NODELAY);
31f18b77
FG
12904      } else if (is_max_size_approaching(in)) {
12905        check_caps(in, 0);
7c673cae
FG
12906      }
12907    }
12908  }
12909
  // If an uninline was started above, wait for it (again without
  // client_lock) and fold its result into r. -ECANCELED means someone else
  // already uninlined the file, which is success for our purposes.
12910  if (onuninline) {
12911    client_lock.Unlock();
12912    uninline_flock.Lock();
12913    while (!uninline_done)
12914      uninline_cond.Wait(uninline_flock);
12915    uninline_flock.Unlock();
12916    client_lock.Lock();
12917
12918    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
12919      in->inline_data.clear();
12920      in->inline_version = CEPH_INLINE_NONE;
12921      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
12922      check_caps(in, 0);
12923    } else
12924      r = uninline_ret;
12925  }
12926
12927  put_cap_ref(in, CEPH_CAP_FILE_WR);
12928  return r;
12929}
12930#else
12931
// Stub for platforms whose headers lack FALLOC_FL_PUNCH_HOLE.
12932int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
12933{
12934  return -EOPNOTSUPP;
12935}
12936
12937#endif
12938
12939
// ll_fallocate: libcephfs low-level fallocate entry point. Lock, trace,
// refuse while unmounting, then hand off to _fallocate().
12940int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
12941{
12942  Mutex::Locker lock(client_lock);
12943  ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
12944  tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
12945  tout(cct) << (unsigned long)fh << std::endl;
12946
181888fb
FG
12947  if (unmounting)
12948    return -ENOTCONN;
12949
7c673cae
FG
12950  return _fallocate(fh, mode, offset, length);
12951}
12952
12953int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
12954{
12955 Mutex::Locker lock(client_lock);
12956 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
12957
181888fb
FG
12958 if (unmounting)
12959 return -ENOTCONN;
12960
7c673cae
FG
12961 Fh *fh = get_filehandle(fd);
12962 if (!fh)
12963 return -EBADF;
12964#if defined(__linux__) && defined(O_PATH)
12965 if (fh->flags & O_PATH)
12966 return -EBADF;
12967#endif
12968 return _fallocate(fh, mode, offset, length);
12969}
12970
12971int Client::ll_release(Fh *fh)
12972{
12973 Mutex::Locker lock(client_lock);
12974 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
12975 dendl;
12976 tout(cct) << "ll_release (fh)" << std::endl;
12977 tout(cct) << (unsigned long)fh << std::endl;
12978
181888fb
FG
12979 if (unmounting)
12980 return -ENOTCONN;
12981
7c673cae
FG
12982 if (ll_unclosed_fh_set.count(fh))
12983 ll_unclosed_fh_set.erase(fh);
12984 return _release_fh(fh);
12985}
12986
// ll_getlk: low-level POSIX advisory lock query (fcntl F_GETLK analogue).
// `owner` identifies the lock owner for conflict detection.
12987int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
12988{
12989  Mutex::Locker lock(client_lock);
12990
12991  ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
  // NOTE(review): trace tag reads "ll_getk" (missing 'l') — likely a typo,
  // but replay tooling may depend on the exact token; confirm before fixing.
12992  tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
12993
181888fb
FG
12994  if (unmounting)
12995    return -ENOTCONN;
12996
7c673cae
FG
12997  return _getlk(fh, fl, owner);
12998}
12999
// ll_setlk: low-level POSIX advisory lock set (F_SETLK/F_SETLKW analogue);
// `sleep` selects blocking behavior.
13000int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13001{
13002  Mutex::Locker lock(client_lock);
13003
13004  ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
  // NOTE(review): same "ll_setk" trace-tag typo as above.
13005  tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13006
181888fb
FG
13007  if (unmounting)
13008    return -ENOTCONN;
13009
7c673cae
FG
13010  return _setlk(fh, fl, owner, sleep);
13011}
13012
// ll_flock: low-level BSD flock(2) analogue; `cmd` is the LOCK_* operation.
13013int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13014{
13015  Mutex::Locker lock(client_lock);
13016
13017  ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
13018  tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
13019
181888fb
FG
13020  if (unmounting)
13021    return -ENOTCONN;
13022
7c673cae
FG
13023  return _flock(fh, cmd, owner);
13024}
13025
// Finisher context that interrupts a pending SETFILELOCK MDS request.
// Holds a ref on the request from construction until finish() runs.
13026class C_Client_RequestInterrupt : public Context {
13027private:
13028  Client *client;
13029  MetaRequest *req;
13030public:
13031  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13032    req->get();
13033  }
13034  void finish(int r) override {
    // Runs on the interrupt finisher thread, so client_lock must be taken
    // here rather than inherited from the caller.
13035    Mutex::Locker l(client->client_lock);
13036    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
13037    client->_interrupt_filelock(req);
13038    client->put_request(req);
13039  }
13040};
13041
// ll_interrupt: queue an interruption of an in-flight file-lock request.
// `d` is the opaque MetaRequest pointer previously handed to the caller.
// Deliberately does not take client_lock; the queued context does.
13042void Client::ll_interrupt(void *d)
13043{
13044  MetaRequest *req = static_cast<MetaRequest*>(d);
13045  ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
13046  tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
13047  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13048}
13049
13050// =========================================
13051// layout
13052
13053// expose file layouts
13054
// describe_layout: resolve `relpath` and copy the inode's file layout into
// *lp. Returns 0, a path_walk error, or -ENOTCONN while unmounting.
13055int Client::describe_layout(const char *relpath, file_layout_t *lp,
13056			    const UserPerm& perms)
13057{
13058  Mutex::Locker lock(client_lock);
13059
181888fb
FG
13060  if (unmounting)
13061    return -ENOTCONN;
13062
7c673cae
FG
13063  filepath path(relpath);
13064  InodeRef in;
13065  int r = path_walk(path, &in, perms);
13066  if (r < 0)
13067    return r;
13068
13069  *lp = in->layout;
13070
13071  ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
13072  return 0;
13073}
13074
// fdescribe_layout: same as describe_layout, but starting from an open fd
// instead of a path. -EBADF if the fd is unknown.
13075int Client::fdescribe_layout(int fd, file_layout_t *lp)
13076{
13077  Mutex::Locker lock(client_lock);
13078
181888fb
FG
13079  if (unmounting)
13080    return -ENOTCONN;
13081
7c673cae
FG
13082  Fh *f = get_filehandle(fd);
13083  if (!f)
13084    return -EBADF;
13085  Inode *in = f->inode.get();
13086
13087  *lp = in->layout;
13088
13089  ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
13090  return 0;
13091}
13092
d2e6a577
FG
// get_default_pool_id: the MDS map's first data pool is the default data
// pool for new files. Negative return means error (-ENOTCONN).
13093int64_t Client::get_default_pool_id()
13094{
13095  Mutex::Locker lock(client_lock);
181888fb
FG
13096
13097  if (unmounting)
13098    return -ENOTCONN;
13099
d2e6a577
FG
13100  /* first data pool is the default */
13101  return mdsmap->get_first_data_pool();
13102}
7c673cae
FG
13103
13104// expose osdmap
13105
// get_pool_id: look up a pool id by name in the current OSD map.
13106int64_t Client::get_pool_id(const char *pool_name)
13107{
13108  Mutex::Locker lock(client_lock);
181888fb
FG
13109
13110  if (unmounting)
13111    return -ENOTCONN;
13112
7c673cae
FG
13113  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13114			       pool_name);
13115}
13116
// get_pool_name: inverse lookup; empty string when the pool does not exist
// (and also while unmounting, since there is no errno channel here).
13117string Client::get_pool_name(int64_t pool)
13118{
13119  Mutex::Locker lock(client_lock);
181888fb
FG
13120
13121  if (unmounting)
13122    return string();
13123
7c673cae
FG
13124  return objecter->with_osdmap([pool](const OSDMap& o) {
13125      return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13126    });
13127}
13128
// get_pool_replication: pool's replica count ("size"), or -ENOENT if the
// pool is unknown.
13129int Client::get_pool_replication(int64_t pool)
13130{
13131  Mutex::Locker lock(client_lock);
181888fb
FG
13132
13133  if (unmounting)
13134    return -ENOTCONN;
13135
7c673cae
FG
13136  return objecter->with_osdmap([pool](const OSDMap& o) {
13137      return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13138    });
13139}
13140
// get_file_extent_osds: for the byte at file offset `off`, report the acting
// OSD set and (optionally, via *len) the remaining bytes of that stripe unit.
// -EBADF on bad fd, -EINVAL if no acting OSDs were found.
13141int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13142{
13143  Mutex::Locker lock(client_lock);
13144
181888fb
FG
13145  if (unmounting)
13146    return -ENOTCONN;
13147
7c673cae
FG
13148  Fh *f = get_filehandle(fd);
13149  if (!f)
13150    return -EBADF;
13151  Inode *in = f->inode.get();
13152
  // Map a 1-byte range so exactly one extent (one object) comes back.
13153  vector<ObjectExtent> extents;
13154  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
13155  assert(extents.size() == 1);
13156
13157  objecter->with_osdmap([&](const OSDMap& o) {
13158      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13159      o.pg_to_acting_osds(pg, osds);
13160    });
13161
13162  if (osds.empty())
13163    return -EINVAL;
13164
13165  /*
13166   * Return the remainder of the extent (stripe unit)
13167   *
13168   * If length = 1 is passed to Striper::file_to_extents we get a single
13169   * extent back, but its length is one so we still need to compute the length
13170   * to the end of the stripe unit.
13171   *
13172   * If length = su then we may get 1 or 2 objects back in the extents vector
13173   * which would have to be examined. Even then, the offsets are local to the
13174   * object, so matching up to the file offset is extra work.
13175   *
13176   * It seems simpler to stick with length = 1 and manually compute the
13177   * remainder.
13178   */
13179  if (len) {
13180    uint64_t su = in->layout.stripe_unit;
13181    *len = su - (off % su);
13182  }
13183
13184  return 0;
13185}
13186
// get_osd_crush_location: full ordered CRUSH location (type,name pairs) for
// an OSD id. -EINVAL for negative ids.
13187int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13188{
13189  Mutex::Locker lock(client_lock);
181888fb
FG
13190
13191  if (unmounting)
13192    return -ENOTCONN;
13193
7c673cae
FG
13194  if (id < 0)
13195    return -EINVAL;
13196  return objecter->with_osdmap([&](const OSDMap& o) {
13197      return o.crush->get_full_location_ordered(id, path);
13198    });
13199}
13200
// get_file_stripe_address: addresses of the acting OSDs that hold the object
// containing file offset `offset`. -EBADF on bad fd, -EINVAL if the PG has
// no acting set.
13201int Client::get_file_stripe_address(int fd, loff_t offset,
13202				    vector<entity_addr_t>& address)
13203{
13204  Mutex::Locker lock(client_lock);
13205
181888fb
FG
13206  if (unmounting)
13207    return -ENOTCONN;
13208
7c673cae
FG
13209  Fh *f = get_filehandle(fd);
13210  if (!f)
13211    return -EBADF;
13212  Inode *in = f->inode.get();
13213
13214  // which object?
13215  vector<ObjectExtent> extents;
13216  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13217			   in->truncate_size, extents);
13218  assert(extents.size() == 1);
13219
13220  // now we have the object and its 'layout'
13221  return objecter->with_osdmap([&](const OSDMap& o) {
13222      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13223      vector<int> osds;
13224      o.pg_to_acting_osds(pg, osds);
13225      if (osds.empty())
13226	return -EINVAL;
13227      for (unsigned i = 0; i < osds.size(); i++) {
13228	entity_addr_t addr = o.get_addr(osds[i]);
13229	address.push_back(addr);
13230      }
13231      return 0;
13232    });
13233}
13234
// get_osd_addr: network address of a single OSD; -ENOENT if it does not
// exist in the current map.
13235int Client::get_osd_addr(int osd, entity_addr_t& addr)
13236{
13237  Mutex::Locker lock(client_lock);
181888fb
FG
13238
13239  if (unmounting)
13240    return -ENOTCONN;
13241
7c673cae
FG
13242  return objecter->with_osdmap([&](const OSDMap& o) {
13243      if (!o.exists(osd))
13244	return -ENOENT;
13245
13246      addr = o.get_addr(osd);
13247      return 0;
13248    });
13249}
13250
// enumerate_layout: expand a byte range of an open file into the list of
// object extents it maps to. -EBADF on bad fd.
13251int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13252			     loff_t length, loff_t offset)
13253{
13254  Mutex::Locker lock(client_lock);
13255
181888fb
FG
13256  if (unmounting)
13257    return -ENOTCONN;
13258
7c673cae
FG
13259  Fh *f = get_filehandle(fd);
13260  if (!f)
13261    return -EBADF;
13262  Inode *in = f->inode.get();
13263
13264  // map to a list of extents
13265  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13266
13267  ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13268  return 0;
13269}
13270
13271
13272/*
13273 * find an osd with the same ip. -1 if none.
13274 */
13275int Client::get_local_osd()
13276{
13277  Mutex::Locker lock(client_lock);
181888fb
FG
13278
13279  if (unmounting)
13280    return -ENOTCONN;
13281
7c673cae
FG
  // Cache the lookup per OSD map epoch; recompute only when the map changes.
13282  objecter->with_osdmap([this](const OSDMap& o) {
13283      if (o.get_epoch() != local_osd_epoch) {
13284	local_osd = o.find_osd_on_ip(messenger->get_myaddr());
13285	local_osd_epoch = o.get_epoch();
13286      }
13287    });
13288  return local_osd;
13289}
13290
13291
13292
13293
13294
13295
13296// ===============================
13297
// Messenger callback: a connection was (re)established. Log only.
13298void Client::ms_handle_connect(Connection *con)
13299{
13300  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
13301}
13302
// Messenger callback: our side reset the connection. Returning false means
// we do not want the messenger to keep the session state.
13303bool Client::ms_handle_reset(Connection *con)
13304{
13305  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
13306  return false;
13307}
13308
// Messenger callback: the peer reset the connection. For MDS peers, map the
// connection back to its MetaSession and react according to session state.
13309void Client::ms_handle_remote_reset(Connection *con)
13310{
13311  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
13312  Mutex::Locker l(client_lock);
13313  switch (con->get_peer_type()) {
13314  case CEPH_ENTITY_TYPE_MDS:
13315    {
13316      // kludge to figure out which mds this is; fixme with a Connection* state
13317      mds_rank_t mds = MDS_RANK_NONE;
13318      MetaSession *s = NULL;
13319      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
13320	   p != mds_sessions.end();
13321	   ++p) {
13322	if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
13323	  mds = p->first;
13324	  s = p->second;
13325	}
13326      }
13327      if (mds >= 0) {
d2e6a577 13328	assert (s != NULL);
7c673cae
FG
13329	switch (s->state) {
	// CLOSING: the reset completes the close we already wanted.
13330	case MetaSession::STATE_CLOSING:
13331	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
13332	  _closed_mds_session(s);
13333	  break;

	// OPENING: drop the half-open session but carry its waiters over to
	// a fresh session attempt.
13334
13335	case MetaSession::STATE_OPENING:
13336	  {
13337	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
13338	    list<Context*> waiters;
13339	    waiters.swap(s->waiting_for_open);
13340	    _closed_mds_session(s);
13341	    MetaSession *news = _get_or_open_mds_session(mds);
13342	    news->waiting_for_open.swap(waiters);
13343	  }
13344	  break;

	// OPEN: either close for a full reconnect, or just mark stale,
	// depending on client_reconnect_stale.
13345
13346	case MetaSession::STATE_OPEN:
13347	  {
13348	    const md_config_t *conf = cct->_conf;
13349	    if (conf->client_reconnect_stale) {
13350	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
13351	      _closed_mds_session(s);
13352	    } else {
13353	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
13354	      s->state = MetaSession::STATE_STALE;
13355	    }
13356	  }
13357	  break;
13358
13359	case MetaSession::STATE_NEW:
13360	case MetaSession::STATE_CLOSED:
13361	default:
13362	  break;
13363	}
13364      }
13365    }
13366    break;
13367  }
13368}
13369
// Messenger callback: connection attempt was refused. Log only.
13370bool Client::ms_handle_refused(Connection *con)
13371{
13372  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
13373  return false;
13374}
13375
// Build an authorizer for outgoing connections; monitors authenticate us
// differently, so no authorizer is built for them.
13376bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
13377{
13378  if (dest_type == CEPH_ENTITY_TYPE_MON)
13379    return true;
13380  *authorizer = monclient->build_authorizer(dest_type);
13381  return true;
13382}
13383
// get_quota_root: walk up from `in` to the nearest ancestor with quota
// enabled (excluding `in` itself), falling back to root_ancestor. Cached
// dentries are used when their lease or directory caps are still valid;
// otherwise a LOOKUPNAME request is issued to the MDS, which drops
// client_lock and forces the walk to restart from `in`.
13384Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
13385{
13386  Inode *cur = in;
13387  utime_t now = ceph_clock_now();
13388
13389  while (cur) {
13390    if (cur != in && cur->quota.is_enable())
13391      break;
13392
    // Try to find a trustworthy parent via any of cur's dentries: either a
    // live MDS lease, or shared caps on the parent dir with a matching gen.
13393    Inode *parent_in = NULL;
13394    if (!cur->dn_set.empty()) {
13395      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
13396	Dentry *dn = *p;
13397	if (dn->lease_mds >= 0 &&
13398	    dn->lease_ttl > now &&
13399	    mds_sessions.count(dn->lease_mds)) {
13400	  parent_in = dn->dir->parent_inode;
13401	} else {
13402	  Inode *diri = dn->dir->parent_inode;
13403	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
13404	      diri->shared_gen == dn->cap_shared_gen) {
13405	    parent_in = dn->dir->parent_inode;
13406	  }
13407	}
13408	if (parent_in)
13409	  break;
13410      }
13411    } else if (root_parents.count(cur)) {
      // Root of a subtree mount: parent recorded separately.
13412      parent_in = root_parents[cur].get();
13413    }
13414
13415    if (parent_in) {
13416      cur = parent_in;
13417      continue;
13418    }
13419
13420    if (cur == root_ancestor)
13421      break;
13422
181888fb
FG
13423    // deleted inode
13424    if (cur->nlink == 0) {
13425      cur = root_ancestor;
13426      break;
13427    }
13428
7c673cae
FG
    // No cached path upward: ask the MDS for the parent by name.
13429    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
13430    filepath path(cur->ino);
13431    req->set_filepath(path);
13432    req->set_inode(cur);
13433
13434    InodeRef parent_ref;
13435    int ret = make_request(req, perms, &parent_ref);
13436    if (ret < 0) {
13437      ldout(cct, 1) << __func__ << " " << in->vino()
13438		    << " failed to find parent of " << cur->vino()
13439		    << " err " << ret << dendl;
13440      // FIXME: what to do?
13441      cur = root_ancestor;
13442      break;
13443    }
13444
    // make_request() may have dropped client_lock; cached state could have
    // changed, so restart the walk unless we just resolved `in`'s parent.
13445    now = ceph_clock_now();
13446    if (cur == in)
13447      cur = parent_ref.get();
13448    else
13449      cur = in; // start over
13450  }
13451
13452  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
13453  return cur;
13454}
13455
13456/**
13457 * Traverse quota ancestors of the Inode, return true
13458 * if any of them passes the passed function
13459 */
13460bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13461				   std::function<bool (const Inode &in)> test)
13462{
13463  while (true) {
13464    assert(in != NULL);
13465    if (test(*in)) {
13466      return true;
13467    }
13468
13469    if (in == root_ancestor) {
13470      // We're done traversing, drop out
13471      return false;
13472    } else {
13473      // Continue up the tree
13474      in = get_quota_root(in, perms);
13475    }
13476  }
13477
  // Unreachable: the loop above only exits via return.
13478  return false;
13479}
13480
// True when any quota ancestor already has rsize at or above max_files.
13481bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13482{
13483  return check_quota_condition(in, perms,
13484      [](const Inode &in) {
13485        return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13486      });
13487}
13488
// True when writing `new_bytes` more would push any quota ancestor past
// max_bytes.
13489bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
13490				     const UserPerm& perms)
13491{
13492  return check_quota_condition(in, perms,
13493      [&new_bytes](const Inode &in) {
13494        return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13495               > in.quota.max_bytes;
13496      });
13497}
13498
// True when unreported local growth is within a 1/16 margin of the byte
// quota on any ancestor — a hint to flush caps early.
13499bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
13500{
13501  return check_quota_condition(in, perms,
13502      [](const Inode &in) {
13503        if (in.quota.max_bytes) {
13504          if (in.rstat.rbytes >= in.quota.max_bytes) {
13505            return true;
13506          }
13507
13508          assert(in.size >= in.reported_size);
13509          const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
13510          const uint64_t size = in.size - in.reported_size;
13511          return (space >> 4) < size;
13512        } else {
13513          return false;
13514        }
13515      });
13516}
13517
// States/flags for the per-(pool,namespace) permission cache (pool_perms).
13518enum {
13519  POOL_CHECKED = 1,
13520  POOL_CHECKING = 2,
13521  POOL_READ = 4,
13522  POOL_WRITE = 8,
13523};
13524
// check_pool_perm: verify this client may read/write the pool backing `in`
// by issuing a probe stat (read) and create (write) against the file's first
// object. Results are cached per (pool, namespace); concurrent callers wait
// on waiting_for_pool_perm while one of them performs the probe.
// Returns 0, -EPERM on missing permission, -EIO on indeterminate probe.
13525int Client::check_pool_perm(Inode *in, int need)
13526{
13527  if (!cct->_conf->client_check_pool_perm)
13528    return 0;
13529
13530  int64_t pool_id = in->layout.pool_id;
13531  std::string pool_ns = in->layout.pool_ns;
13532  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
13533  int have = 0;
13534  while (true) {
13535    auto it = pool_perms.find(perm_key);
13536    if (it == pool_perms.end())
13537      break;
13538    if (it->second == POOL_CHECKING) {
13539      // avoid concurrent checkings
13540      wait_on_list(waiting_for_pool_perm);
13541    } else {
13542      have = it->second;
13543      assert(have & POOL_CHECKED);
13544      break;
13545    }
13546  }
13547
13548  if (!have) {
13549    if (in->snapid != CEPH_NOSNAP) {
13550      // pool permission check needs to write to the first object. But for snapshot,
13551      // head of the first object may have alread been deleted. To avoid creating
13552      // orphan object, skip the check for now.
13553      return 0;
13554    }
13555
    // Mark as in-progress so other threads block instead of double-probing.
13556    pool_perms[perm_key] = POOL_CHECKING;
13557
13558    char oid_buf[32];
13559    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
13560    object_t oid = oid_buf;
13561
13562    SnapContext nullsnapc;
13563
13564    C_SaferCond rd_cond;
13565    ObjectOperation rd_op;
13566    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);
13567
13568    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
13569		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
13570
13571    C_SaferCond wr_cond;
13572    ObjectOperation wr_op;
13573    wr_op.create(true);
13574
13575    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
13576		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
13577
    // Block on both probes without holding client_lock.
13578    client_lock.Unlock();
13579    int rd_ret = rd_cond.wait();
13580    int wr_ret = wr_cond.wait();
13581    client_lock.Lock();
13582
13583    bool errored = false;
13584
    // -ENOENT still proves read permission (the object may not exist yet).
13585    if (rd_ret == 0 || rd_ret == -ENOENT)
13586      have |= POOL_READ;
13587    else if (rd_ret != -EPERM) {
13588      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
13589		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
13590      errored = true;
13591    }
13592
    // -EEXIST still proves write permission (exclusive create raced).
13593    if (wr_ret == 0 || wr_ret == -EEXIST)
13594      have |= POOL_WRITE;
13595    else if (wr_ret != -EPERM) {
13596      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
13597		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
13598      errored = true;
13599    }
13600
13601    if (errored) {
13602      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
13603      // Raise EIO because actual error code might be misleading for
13604      // userspace filesystem user.
13605      pool_perms.erase(perm_key);
13606      signal_cond_list(waiting_for_pool_perm);
13607      return -EIO;
13608    }
13609
13610    pool_perms[perm_key] = have | POOL_CHECKED;
13611    signal_cond_list(waiting_for_pool_perm);
13612  }
13613
13614  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
13615    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
13616		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
13617    return -EPERM;
13618  }
13619  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
13620    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
13621		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
13622    return -EPERM;
13623  }
13624
13625  return 0;
13626}
13627
// _posix_acl_permission: evaluate the cached ACL_EA_ACCESS xattr against
// `want`. Returns -EAGAIN when no ACL applies so the caller falls back to
// plain mode-bit checks.
13628int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13629{
13630  if (acl_type == POSIX_ACL) {
13631    if (in->xattrs.count(ACL_EA_ACCESS)) {
13632      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13633
13634      return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13635    }
13636  }
13637  return -EAGAIN;
13638}
13639
// _posix_acl_chmod: keep the access ACL consistent with a chmod by rewriting
// its mode-mirroring entries and storing the updated xattr.
13640int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
13641{
13642  if (acl_type == NO_ACL)
13643    return 0;
13644
  // Refresh xattrs from the MDS if we have never fetched them.
13645  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
13646  if (r < 0)
13647    goto out;
13648
13649  if (acl_type == POSIX_ACL) {
13650    if (in->xattrs.count(ACL_EA_ACCESS)) {
13651      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13652      bufferptr acl(access_acl.c_str(), access_acl.length());
13653      r = posix_acl_access_chmod(acl, mode);
13654      if (r < 0)
13655	goto out;
13656      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
13657    } else {
13658      r = 0;
13659    }
13660  }
13661out:
13662  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
13663  return r;
13664}
13665
// _posix_acl_create: compute the ACL xattrs a new inode under `dir` should
// inherit from the directory's default ACL, possibly adjusting *mode.
// Encodes the resulting xattr map into xattrs_bl; returns the number of
// xattrs (>=0) or a negative errno. When no default ACL exists, applies the
// umask callback instead.
13666int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
13667			      const UserPerm& perms)
13668{
13669  if (acl_type == NO_ACL)
13670    return 0;
13671
  // Symlinks never carry ACLs or honor umask.
13672  if (S_ISLNK(*mode))
13673    return 0;
13674
13675  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
13676  if (r < 0)
13677    goto out;
13678
13679  if (acl_type == POSIX_ACL) {
13680    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
13681      map<string, bufferptr> xattrs;
13682
13683      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
13684      bufferptr acl(default_acl.c_str(), default_acl.length());
13685      r = posix_acl_inherit_mode(acl, mode);
13686      if (r < 0)
13687	goto out;
13688
13689      if (r > 0) {
	// Non-trivial inherited ACL: store it unless it is fully expressible
	// in the mode bits alone.
13690	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
13691	if (r < 0)
13692	  goto out;
13693	if (r > 0)
13694	  xattrs[ACL_EA_ACCESS] = acl;
13695      }
13696
      // Directories propagate the default ACL to their children.
13697      if (S_ISDIR(*mode))
13698	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
13699
13700      r = xattrs.size();
13701      if (r > 0)
13702	::encode(xattrs, xattrs_bl);
13703    } else {
13704      if (umask_cb)
13705	*mode &= ~umask_cb(callback_handle);
13706      r = 0;
13707    }
13708  }
13709out:
13710  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
13711  return r;
13712}
13713
// set_filer_flags: enable a global objecter op flag (only LOCALIZE_READS,
// or 0, is permitted here).
13714void Client::set_filer_flags(int flags)
13715{
13716  Mutex::Locker l(client_lock);
13717  assert(flags == 0 ||
13718	 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13719  objecter->add_global_op_flags(flags);
13720}
13721
// clear_filer_flags: inverse of set_filer_flags.
13722void Client::clear_filer_flags(int flags)
13723{
13724  Mutex::Locker l(client_lock);
13725  assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13726  objecter->clear_global_op_flag(flags);
13727}
13728
13729/**
13730 * This is included in cap release messages, to cause
13731 * the MDS to wait until this OSD map epoch. It is necessary
13732 * in corner cases where we cancel RADOS ops, so that
13733 * nobody else tries to do IO to the same objects in
13734 * the same epoch as the cancelled ops.
13735 */
13736void Client::set_cap_epoch_barrier(epoch_t e)
13737{
13738  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
13739  cap_epoch_barrier = e;
13740}
13741
// Config-observer interface: keys we want handle_conf_change() called for.
13742const char** Client::get_tracked_conf_keys() const
13743{
13744  static const char* keys[] = {
13745    "client_cache_size",
13746    "client_cache_mid",
13747    "client_acl_type",
13748    NULL
13749  };
13750  return keys;
13751}
13752
// React to runtime config changes for the tracked keys above.
// NOTE(review): "client_cache_size" is tracked but not handled in this
// function — presumably picked up elsewhere (e.g. cache trimming reads the
// config directly); confirm this is not a gap.
13753void Client::handle_conf_change(const struct md_config_t *conf,
13754				const std::set <std::string> &changed)
13755{
13756  Mutex::Locker lock(client_lock);
13757
181888fb 13758  if (changed.count("client_cache_mid")) {
7c673cae
FG
13759    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
13760  }
13761  if (changed.count("client_acl_type")) {
13762    acl_type = NO_ACL;
13763    if (cct->_conf->client_acl_type == "posix_acl")
13764      acl_type = POSIX_ACL;
13765  }
13766}
13767
// init_groups: populate a UserPerm with the supplementary gids of its
// uid/gid. Ownership of the sgids array passes to the UserPerm.
13768void Client::init_groups(UserPerm *perms)
13769{
13770  gid_t *sgids;
13771  int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
13772  perms->init_gids(sgids, count);
13773}
13774
// boost::intrusive_ptr support for Inode reference counting.
13775void intrusive_ptr_add_ref(Inode *in)
13776{
13777  in->get();
13778}
13779
13780void intrusive_ptr_release(Inode *in)
13781{
13782  in->client->put_inode(in);
13783}
13784
// _get_random_up_mds: uniformly pick one currently-up MDS rank, or
// MDS_RANK_NONE when none are up. Caller must hold client_lock.
13785mds_rank_t Client::_get_random_up_mds() const
13786{
13787  assert(client_lock.is_locked_by_me());
13788
13789  std::set<mds_rank_t> up;
13790  mdsmap->get_up_mds_set(up);
13791
13792  if (up.empty())
13793    return MDS_RANK_NONE;
  // set iterators are bidirectional, so advance manually to a random index.
13794  std::set<mds_rank_t>::const_iterator p = up.begin();
13795  for (int n = rand() % up.size(); n; n--)
13796    ++p;
13797  return *p;
13798}
13799
13800
// StandaloneClient: a Client that owns its own Objecter (as opposed to
// embedding into a host process that supplies one).
13801StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
13802    : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
13803{
13804  monclient->set_messenger(m);
13805  objecter->set_client_incarnation(0);
13806}
13807
13808StandaloneClient::~StandaloneClient()
13809{
13810  delete objecter;
13811  objecter = nullptr;
13812}
13813
// Bring up timer, object cacher, objecter, dispatchers and the mon client;
// on monclient failure every partially-started component is torn down again
// before returning the error.
13814int StandaloneClient::init()
13815{
13816  timer.init();
13817  objectcacher->start();
13818  objecter->init();
13819
13820  client_lock.Lock();
13821  assert(!initialized);
13822
13823  messenger->add_dispatcher_tail(objecter);
13824  messenger->add_dispatcher_tail(this);
13825
13826  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
13827  int r = monclient->init();
13828  if (r < 0) {
13829    // need to do cleanup because we're in an intermediate init state
13830    timer.shutdown();
13831    client_lock.Unlock();
13832    objecter->shutdown();
13833    objectcacher->stop();
13834    monclient->shutdown();
13835    return r;
13836  }
13837  objecter->start();
13838
13839  client_lock.Unlock();
13840  _finish_init();
13841
13842  return 0;
13843}
13844
// Shut down the base Client first, then the components this subclass owns.
13845void StandaloneClient::shutdown()
13846{
13847  Client::shutdown();
13848  objecter->shutdown();
13849  monclient->shutdown();
13850}
13851