]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
update sources to v12.2.3
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
21#include <sys/stat.h>
22#include <sys/param.h>
23#include <fcntl.h>
24#include <sys/file.h>
25#include <sys/utsname.h>
26#include <sys/uio.h>
27
28#include <boost/lexical_cast.hpp>
29#include <boost/fusion/include/std_pair.hpp>
30
31#if defined(__FreeBSD__)
32#define XATTR_CREATE 0x1
33#define XATTR_REPLACE 0x2
34#else
35#include <sys/xattr.h>
36#endif
37
38#if defined(__linux__)
39#include <linux/falloc.h>
40#endif
41
42#include <sys/statvfs.h>
43
44#include "common/config.h"
45#include "common/version.h"
46
47// ceph stuff
48#include "messages/MClientSession.h"
49#include "messages/MClientReconnect.h"
50#include "messages/MClientRequest.h"
51#include "messages/MClientRequestForward.h"
52#include "messages/MClientReply.h"
53#include "messages/MClientCaps.h"
54#include "messages/MClientLease.h"
55#include "messages/MClientSnap.h"
56#include "messages/MCommandReply.h"
57#include "messages/MOSDMap.h"
58#include "messages/MClientQuota.h"
59#include "messages/MClientCapRelease.h"
60#include "messages/MMDSMap.h"
61#include "messages/MFSMap.h"
62#include "messages/MFSMapUser.h"
63
64#include "mon/MonClient.h"
65
66#include "mds/flock.h"
67#include "osd/OSDMap.h"
68#include "osdc/Filer.h"
69
70#include "common/Cond.h"
71#include "common/Mutex.h"
72#include "common/perf_counters.h"
73#include "common/admin_socket.h"
74#include "common/errno.h"
75#include "include/str_list.h"
76
77#define dout_subsys ceph_subsys_client
78
79#include "include/lru.h"
80#include "include/compat.h"
81#include "include/stringify.h"
82
83#include "Client.h"
84#include "Inode.h"
85#include "Dentry.h"
b32b8144 86#include "Delegation.h"
7c673cae
FG
87#include "Dir.h"
88#include "ClientSnapRealm.h"
89#include "Fh.h"
90#include "MetaSession.h"
91#include "MetaRequest.h"
92#include "ObjecterWriteback.h"
93#include "posix_acl.h"
94
95#include "include/assert.h"
96#include "include/stat.h"
97
98#include "include/cephfs/ceph_statx.h"
99
100#if HAVE_GETGROUPLIST
101#include <grp.h>
102#include <pwd.h>
103#include <unistd.h>
104#endif
105
106#undef dout_prefix
107#define dout_prefix *_dout << "client." << whoami << " "
108
109#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
110
111// FreeBSD fails to define this
112#ifndef O_DSYNC
113#define O_DSYNC 0x0
114#endif
115// Darwin fails to define this
116#ifndef O_RSYNC
117#define O_RSYNC 0x0
118#endif
119
120#ifndef O_DIRECT
121#define O_DIRECT 0x0
122#endif
123
124#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
125
126void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
127{
128 Client *client = static_cast<Client*>(p);
129 client->flush_set_callback(oset);
130}
131
132
133// -------------
134
135Client::CommandHook::CommandHook(Client *client) :
136 m_client(client)
137{
138}
139
140bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
141 std::string format, bufferlist& out)
142{
143 Formatter *f = Formatter::create(format);
144 f->open_object_section("result");
145 m_client->client_lock.Lock();
146 if (command == "mds_requests")
147 m_client->dump_mds_requests(f);
148 else if (command == "mds_sessions")
149 m_client->dump_mds_sessions(f);
150 else if (command == "dump_cache")
151 m_client->dump_cache(f);
152 else if (command == "kick_stale_sessions")
153 m_client->_kick_stale_sessions();
154 else if (command == "status")
155 m_client->dump_status(f);
156 else
157 assert(0 == "bad command registered");
158 m_client->client_lock.Unlock();
159 f->close_section();
160 f->flush(out);
161 delete f;
162 return true;
163}
164
165
166// -------------
167
168dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
169 : inode(in), offset(0), next_offset(2),
170 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
171 perms(perms)
172 { }
173
174void Client::_reset_faked_inos()
175{
176 ino_t start = 1024;
177 free_faked_inos.clear();
178 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
179 last_used_faked_ino = 0;
180 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
181}
182
// Hand out the next free fake inode number to 'in'.  Allocation walks
// forward from the last number used, wrapping back to the beginning of the
// free pool when the end of the range is reached.
void Client::_assign_faked_ino(Inode *in)
{
  // free_faked_inos is an interval_set; lower_bound() finds the first free
  // interval at or after the hint.
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // ran off the end of the range: wrap around and retry from the start
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // the pool is assumed never to be exhausted
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // the next free number is past our cursor: jump to the interval start
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // cursor is inside the free interval: just take the next number
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // number is now in use: pull it from the free pool and record which real
  // (ino, snapid) it stands for
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
202
203void Client::_release_faked_ino(Inode *in)
204{
205 free_faked_inos.insert(in->faked_ino);
206 faked_ino_map.erase(in->faked_ino);
207}
208
209vinodeno_t Client::_map_faked_ino(ino_t ino)
210{
211 vinodeno_t vino;
212 if (ino == 1)
213 vino = root->vino();
214 else if (faked_ino_map.count(ino))
215 vino = faked_ino_map[ino];
216 else
217 vino = vinodeno_t(0, CEPH_NOSNAP);
218 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
219 return vino;
220}
221
222vinodeno_t Client::map_faked_ino(ino_t ino)
223{
224 Mutex::Locker lock(client_lock);
225 return _map_faked_ino(ino);
226}
227
228// cons/des
229
// Construct a client bound to the given messenger, monitor client and
// objecter.  NOTE: initializer order must match the member declaration
// order in Client.h; do not reorder.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    getgroups_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-ENXIO), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock"),
    deleg_timeout(0)
{
  // seed the fake-inode allocator
  _reset_faked_inos();
  root = 0;

  num_flushing_caps = 0;

  // precompute listxattr buffer sizes for the virtual xattr tables
  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles; fds below 10 are reserved
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					    &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				  client_flush_set_callback,    // all commit callback
				  (void*)this,
				  cct->_conf->client_oc_size,
				  cct->_conf->client_oc_max_objects,
				  cct->_conf->client_oc_max_dirty,
				  cct->_conf->client_oc_target_dirty,
				  cct->_conf->client_oc_max_dirty_age,
				  true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  // get notified when this client gets blacklisted by the OSDs
  objecter->enable_blacklist_events();
}
298
299
300Client::~Client()
301{
302 assert(!client_lock.is_locked());
303
31f18b77
FG
304 // It is necessary to hold client_lock, because any inode destruction
305 // may call into ObjectCacher, which asserts that it's lock (which is
306 // client_lock) is held.
307 client_lock.Lock();
7c673cae 308 tear_down_cache();
31f18b77 309 client_lock.Unlock();
7c673cae
FG
310}
311
312void Client::tear_down_cache()
313{
314 // fd's
315 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
316 it != fd_map.end();
317 ++it) {
318 Fh *fh = it->second;
319 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
320 _release_fh(fh);
321 }
322 fd_map.clear();
323
324 while (!opened_dirs.empty()) {
325 dir_result_t *dirp = *opened_dirs.begin();
326 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
327 _closedir(dirp);
328 }
329
330 // caps!
331 // *** FIXME ***
332
333 // empty lru
7c673cae
FG
334 trim_cache();
335 assert(lru.lru_get_size() == 0);
336
337 // close root ino
338 assert(inode_map.size() <= 1 + root_parents.size());
339 if (root && inode_map.size() == 1 + root_parents.size()) {
340 delete root;
341 root = 0;
342 root_ancestor = 0;
343 while (!root_parents.empty())
344 root_parents.erase(root_parents.begin());
345 inode_map.clear();
346 _reset_faked_inos();
347 }
348
349 assert(inode_map.empty());
350}
351
352inodeno_t Client::get_root_ino()
353{
354 Mutex::Locker l(client_lock);
355 if (use_faked_inos())
356 return root->faked_ino;
357 else
358 return root->ino;
359}
360
361Inode *Client::get_root()
362{
363 Mutex::Locker l(client_lock);
364 root->ll_get();
365 return root;
366}
367
368
369// debug crapola
370
371void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
372{
373 filepath path;
374 in->make_long_path(path);
375 ldout(cct, 1) << "dump_inode: "
376 << (disconnected ? "DISCONNECTED ":"")
377 << "inode " << in->ino
378 << " " << path
379 << " ref " << in->get_num_ref()
380 << *in << dendl;
381
382 if (f) {
383 f->open_object_section("inode");
384 f->dump_stream("path") << path;
385 if (disconnected)
386 f->dump_int("disconnected", 1);
387 in->dump(f);
388 f->close_section();
389 }
390
391 did.insert(in);
392 if (in->dir) {
393 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
394 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
395 it != in->dir->dentries.end();
396 ++it) {
397 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
398 if (f) {
399 f->open_object_section("dentry");
400 it->second->dump(f);
401 f->close_section();
402 }
403 if (it->second->inode)
404 dump_inode(f, it->second->inode.get(), did, false);
405 }
406 }
407}
408
409void Client::dump_cache(Formatter *f)
410{
411 set<Inode*> did;
412
413 ldout(cct, 1) << "dump_cache" << dendl;
414
415 if (f)
416 f->open_array_section("cache");
417
418 if (root)
419 dump_inode(f, root, did, true);
420
421 // make a second pass to catch anything disconnected
422 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
423 it != inode_map.end();
424 ++it) {
425 if (did.count(it->second))
426 continue;
427 dump_inode(f, it->second, did, true);
428 }
429
430 if (f)
431 f->close_section();
432}
433
434void Client::dump_status(Formatter *f)
435{
436 assert(client_lock.is_locked_by_me());
437
438 ldout(cct, 1) << __func__ << dendl;
439
440 const epoch_t osd_epoch
441 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
442
443 if (f) {
444 f->open_object_section("metadata");
445 for (const auto& kv : metadata)
446 f->dump_string(kv.first.c_str(), kv.second);
447 f->close_section();
448
449 f->dump_int("dentry_count", lru.lru_get_size());
450 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
451 f->dump_int("id", get_nodeid().v);
452 f->dump_int("inode_count", inode_map.size());
453 f->dump_int("mds_epoch", mdsmap->get_epoch());
454 f->dump_int("osd_epoch", osd_epoch);
455 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
456 }
457}
458
459int Client::init()
460{
461 timer.init();
462 objectcacher->start();
463
464 client_lock.Lock();
465 assert(!initialized);
466
467 messenger->add_dispatcher_tail(this);
468 client_lock.Unlock();
469
470 _finish_init();
471 return 0;
472}
473
474void Client::_finish_init()
475{
476 client_lock.Lock();
477 // logger
478 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
479 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
480 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
481 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
482 logger.reset(plb.create_perf_counters());
483 cct->get_perfcounters_collection()->add(logger.get());
484
485 client_lock.Unlock();
486
487 cct->_conf->add_observer(this);
488
489 AdminSocket* admin_socket = cct->get_admin_socket();
490 int ret = admin_socket->register_command("mds_requests",
491 "mds_requests",
492 &m_command_hook,
493 "show in-progress mds requests");
494 if (ret < 0) {
495 lderr(cct) << "error registering admin socket command: "
496 << cpp_strerror(-ret) << dendl;
497 }
498 ret = admin_socket->register_command("mds_sessions",
499 "mds_sessions",
500 &m_command_hook,
501 "show mds session state");
502 if (ret < 0) {
503 lderr(cct) << "error registering admin socket command: "
504 << cpp_strerror(-ret) << dendl;
505 }
506 ret = admin_socket->register_command("dump_cache",
507 "dump_cache",
508 &m_command_hook,
509 "show in-memory metadata cache contents");
510 if (ret < 0) {
511 lderr(cct) << "error registering admin socket command: "
512 << cpp_strerror(-ret) << dendl;
513 }
514 ret = admin_socket->register_command("kick_stale_sessions",
515 "kick_stale_sessions",
516 &m_command_hook,
517 "kick sessions that were remote reset");
518 if (ret < 0) {
519 lderr(cct) << "error registering admin socket command: "
520 << cpp_strerror(-ret) << dendl;
521 }
522 ret = admin_socket->register_command("status",
523 "status",
524 &m_command_hook,
525 "show overall client status");
526 if (ret < 0) {
527 lderr(cct) << "error registering admin socket command: "
528 << cpp_strerror(-ret) << dendl;
529 }
530
531 client_lock.Lock();
532 initialized = true;
533 client_lock.Unlock();
534}
535
// Tear the client down.  Ordering matters throughout: sessions close under
// the lock, finishers drain before they stop, and the object cacher joins
// its flusher thread outside client_lock.
void Client::shutdown()
{
  ldout(cct, 1) << "shutdown" << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf->remove_observer(this);

  // drop the commands registered in _finish_init()
  AdminSocket* admin_socket = cct->get_admin_socket();
  admin_socket->unregister_command("mds_requests");
  admin_socket->unregister_command("mds_sessions");
  admin_socket->unregister_command("dump_cache");
  admin_socket->unregister_command("kick_stale_sessions");
  admin_socket->unregister_command("status");

  // each finisher is only running if its callback was ever registered;
  // wait_for_empty() drains queued work before stop() joins the thread
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
595
596
597// ===================
598// metadata cache stuff
599
// Shrink the dentry LRU down to client_cache_size (or to zero while
// unmounting).  Optionally asks the kernel to drop its dcache too, and
// drops the root inode once nothing references it.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // loop until a full pass trims nothing (size stops changing)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    // while mounted, stop once we are within the configured budget
    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  // still over budget: pinned dentries remain, so ask the kernel to release
  // its references via its dcache
  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
633
634void Client::trim_cache_for_reconnect(MetaSession *s)
635{
636 mds_rank_t mds = s->mds_num;
637 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
638
639 int trimmed = 0;
640 list<Dentry*> skipped;
641 while (lru.lru_get_size() > 0) {
642 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
643 if (!dn)
644 break;
645
646 if ((dn->inode && dn->inode->caps.count(mds)) ||
647 dn->dir->parent_inode->caps.count(mds)) {
648 trim_dentry(dn);
649 trimmed++;
650 } else
651 skipped.push_back(dn);
652 }
653
654 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
655 lru.lru_insert_mid(*p);
656
657 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
658 << " trimmed " << trimmed << " dentries" << dendl;
659
660 if (s->caps.size() > 0)
661 _invalidate_kernel_dcache();
662}
663
664void Client::trim_dentry(Dentry *dn)
665{
666 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
667 << " in dir " << hex << dn->dir->parent_inode->ino
668 << dendl;
669 if (dn->inode) {
670 Inode *diri = dn->dir->parent_inode;
671 diri->dir_release_count++;
672 clear_dir_complete_and_ordered(diri, true);
673 }
674 unlink(dn, false, false); // drop dir, drop dentry
675}
676
677
// Merge MDS-supplied file metadata (size, truncation state, timestamps,
// inline data) into 'in'.  'issued' is the set of caps we hold; it decides
// whether our local values or the MDS values win for each field.
void Client::update_inode_file_bits(Inode *in,
				    uint64_t truncate_seq, uint64_t truncate_size,
				    uint64_t size, uint64_t change_attr,
				    uint64_t time_warp_seq, utime_t ctime,
				    utime_t mtime,
				    utime_t atime,
				    version_t inline_version,
				    bufferlist& inline_data,
				    int issued)
{
  bool warn = false;
  ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
		 << " mtime " << mtime << dendl;
  ldout(cct, 25) << "truncate_seq: mds " << truncate_seq <<  " local "
		 << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
		 << " local " << in->time_warp_seq << dendl;
  uint64_t prior_size = in->size;

  if (inline_version > in->inline_version) {
    in->inline_data = inline_data;
    in->inline_version = inline_version;
  }

  /* always take a newer change attr */
  if (change_attr > in->change_attr)
    in->change_attr = change_attr;

  // accept the MDS size if its truncation epoch is newer, or if within the
  // same epoch the file only grew
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      // NOTE(review): the invalidation starts at truncate_size, not size —
      // presumably intentional, but verify against upstream before touching
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }

  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    // we may have local updates the MDS has not seen yet, so only move
    // timestamps forward, never back
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
		     << " is higher than local time_warp_seq "
		     << in->time_warp_seq << dendl;
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    // no caps that would let us have newer values: MDS is authoritative
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // MDS sent a time_warp_seq older than ours without us holding EXCL;
    // should not happen
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
783
784void Client::_fragmap_remove_non_leaves(Inode *in)
785{
786 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
787 if (!in->dirfragtree.is_leaf(p->first))
788 in->fragmap.erase(p++);
789 else
790 ++p;
791}
792
793void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
794{
795 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
796 if (p->second == mds)
797 in->fragmap.erase(p++);
798 else
799 ++p;
800}
801
// Insert or refresh an inode from an MDS InodeStat.  Creates the in-memory
// Inode on first sight (wiring up root/root_parents as needed), then merges
// the stat into it, gated on version numbers and the caps we hold.
// Returns the (possibly new) inode; never null.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode we ever see becomes the root
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // before mount completes we may walk up ancestors of the mount point
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  if (was_new)
    ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning indoes in different snaprealms (no caps!)

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool updating_inode = false;
  int issued = 0;
  if (st->version == 0 ||
      (in->version & ~1) < st->version) {
    updating_inode = true;

    // fields covered by caps we hold (issued, dirty or implemented) must
    // not be clobbered by the MDS copy
    int implemented = 0;
    issued = in->caps_issued(&implemented) | in->caps_dirty();
    issued |= implemented;

    in->version = st->version;

    if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
      in->mode = st->mode;
      in->uid = st->uid;
      in->gid = st->gid;
      in->btime = st->btime;
    }

    if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
      in->nlink = st->nlink;
    }

    in->dirstat = st->dirstat;
    in->rstat = st->rstat;
    in->quota = st->quota;
    in->layout = st->layout;

    if (in->is_dir()) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
    }

    update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
			   st->change_attr, st->time_warp_seq, st->ctime,
			   st->mtime, st->atime, st->inline_version,
			   st->inline_data, issued);
  } else if (st->inline_version > in->inline_version) {
    // stat was not newer overall, but the inline data was
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  // move me if/when version reflects fragtree changes.
  if (in->dirfragtree != st->dirfragtree) {
    in->dirfragtree = st->dirfragtree;
    _fragmap_remove_non_leaves(in);
  }

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
		   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
		   request_perms);
    if (in->auth_cap && in->auth_cap->session == session)
      in->max_size = st->max_size;
  } else
    in->snap_caps |= st->cap.caps;

  // setting I_COMPLETE needs to happen after adding the cap
  if (updating_inode &&
      in->is_dir() &&
      (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
      (issued & CEPH_CAP_FILE_EXCL) == 0 &&
      in->dirstat.nfiles == 0 &&
      in->dirstat.nsubdirs == 0) {
    // the MDS says the directory is empty: mark it complete and null out
    // any dentries still cached under it
    ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
    in->flags |= I_COMPLETE | I_DIR_ORDERED;
    if (in->dir) {
      ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		     << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
      in->dir->readdir_cache.clear();
      for (auto p = in->dir->dentries.begin();
	   p != in->dir->dentries.end();
	   ++p) {
	unlink(p->second, true, true);  // keep dir, keep dentry
      }
      if (in->dir->dentries.empty())
	close_dir(in->dir);
    }
  }

  return in;
}
935
936
937/*
938 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
939 */
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * Reuses an existing dentry when it already points at the right inode;
 * otherwise unlinks whatever is stale (including 'old_dentry' from a
 * rename) and links 'in' under 'dname'.  Always refreshes the dentry
 * lease.  Returns the linked dentry.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // already linked correctly; just refresh its LRU position
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref so 'in' cannot be freed while we shuffle dentries around
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// moving between directories breaks the old dir's ordering guarantee
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
985
// Refresh the MDS-issued lease on a dentry.  The lease expires
// duration_ms after the originating request was sent ('from'); an existing
// longer lease is never shortened.
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
		     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      // lease is implicitly invalidated when the session's cap_gen bumps
      dn->lease_gen = session->cap_gen;
    }
  }
  // remember the parent dir's shared gen so the dentry can be revalidated
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
1005
1006
1007/*
1008 * update MDS location cache for a single inode
1009 */
/*
 * update MDS location cache for a single inode
 *
 * Records which MDS rank is authoritative for the given dirfrag (or clears
 * the entry when auth is unknown), forces the local fragtree to agree that
 * the fragment is a leaf, and notes whether the frag is replicated.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1041
1042void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1043{
1044 if (diri->flags & I_COMPLETE) {
1045 if (complete) {
1046 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1047 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1048 } else {
1049 if (diri->flags & I_DIR_ORDERED) {
1050 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1051 diri->flags &= ~I_DIR_ORDERED;
1052 }
1053 }
1054 if (diri->dir)
1055 diri->dir->readdir_cache.clear();
1056 }
1057}
1058
1059/*
1060 * insert results from readdir or lssnap into the metadata cache.
1061 */
1062void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
1063
1064 MClientReply *reply = request->reply;
1065 ConnectionRef con = request->reply->get_connection();
1066 uint64_t features = con->get_features();
1067
1068 dir_result_t *dirp = request->dirp;
1069 assert(dirp);
1070
1071 // the extra buffer list is only set for readdir and lssnap replies
1072 bufferlist::iterator p = reply->get_extra_bl().begin();
1073 if (!p.end()) {
1074 // snapdir?
1075 if (request->head.op == CEPH_MDS_OP_LSSNAP) {
1076 assert(diri);
1077 diri = open_snapdir(diri);
1078 }
1079
1080 // only open dir if we're actually adding stuff to it!
1081 Dir *dir = diri->open_dir();
1082 assert(dir);
1083
1084 // dirstat
1085 DirStat dst(p);
1086 __u32 numdn;
1087 __u16 flags;
1088 ::decode(numdn, p);
1089 ::decode(flags, p);
1090
1091 bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
1092 bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
1093
1094 frag_t fg = (unsigned)request->head.args.readdir.frag;
1095 unsigned readdir_offset = dirp->next_offset;
1096 string readdir_start = dirp->last_name;
1097 assert(!readdir_start.empty() || readdir_offset == 2);
1098
1099 unsigned last_hash = 0;
1100 if (hash_order) {
1101 if (!readdir_start.empty()) {
1102 last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
1103 } else if (flags & CEPH_READDIR_OFFSET_HASH) {
1104 /* mds understands offset_hash */
1105 last_hash = (unsigned)request->head.args.readdir.offset_hash;
1106 }
1107 }
1108
1109 if (fg != dst.frag) {
1110 ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
1111 fg = dst.frag;
1112 if (!hash_order) {
1113 readdir_offset = 2;
1114 readdir_start.clear();
1115 dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
1116 }
1117 }
1118
1119 ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
1120 << ", hash_order=" << hash_order
1121 << ", readdir_start " << readdir_start
1122 << ", last_hash " << last_hash
1123 << ", next_offset " << readdir_offset << dendl;
1124
1125 if (diri->snapid != CEPH_SNAPDIR &&
1126 fg.is_leftmost() && readdir_offset == 2 &&
1127 !(hash_order && last_hash)) {
1128 dirp->release_count = diri->dir_release_count;
1129 dirp->ordered_count = diri->dir_ordered_count;
1130 dirp->start_shared_gen = diri->shared_gen;
1131 dirp->cache_index = 0;
1132 }
1133
1134 dirp->buffer_frag = fg;
1135
1136 _readdir_drop_dirp_buffer(dirp);
1137 dirp->buffer.reserve(numdn);
1138
1139 string dname;
1140 LeaseStat dlease;
1141 for (unsigned i=0; i<numdn; i++) {
1142 ::decode(dname, p);
1143 ::decode(dlease, p);
1144 InodeStat ist(p, features);
1145
1146 ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
1147
1148 Inode *in = add_update_inode(&ist, request->sent_stamp, session,
1149 request->perms);
1150 Dentry *dn;
1151 if (diri->dir->dentries.count(dname)) {
1152 Dentry *olddn = diri->dir->dentries[dname];
1153 if (olddn->inode != in) {
1154 // replace incorrect dentry
1155 unlink(olddn, true, true); // keep dir, dentry
1156 dn = link(dir, dname, in, olddn);
1157 assert(dn == olddn);
1158 } else {
1159 // keep existing dn
1160 dn = olddn;
1161 touch_dn(dn);
1162 }
1163 } else {
1164 // new dn
1165 dn = link(dir, dname, in, NULL);
1166 }
1167
1168 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1169 if (hash_order) {
1170 unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
1171 if (hash != last_hash)
1172 readdir_offset = 2;
1173 last_hash = hash;
1174 dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
1175 } else {
1176 dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
1177 }
1178 // add to readdir cache
1179 if (dirp->release_count == diri->dir_release_count &&
1180 dirp->ordered_count == diri->dir_ordered_count &&
1181 dirp->start_shared_gen == diri->shared_gen) {
1182 if (dirp->cache_index == dir->readdir_cache.size()) {
1183 if (i == 0) {
1184 assert(!dirp->inode->is_complete_and_ordered());
1185 dir->readdir_cache.reserve(dirp->cache_index + numdn);
1186 }
1187 dir->readdir_cache.push_back(dn);
1188 } else if (dirp->cache_index < dir->readdir_cache.size()) {
1189 if (dirp->inode->is_complete_and_ordered())
1190 assert(dir->readdir_cache[dirp->cache_index] == dn);
1191 else
1192 dir->readdir_cache[dirp->cache_index] = dn;
1193 } else {
1194 assert(0 == "unexpected readdir buffer idx");
1195 }
1196 dirp->cache_index++;
1197 }
1198 // add to cached result list
1199 dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
1200 ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
1201 }
1202
1203 if (numdn > 0)
1204 dirp->last_name = dname;
1205 if (end)
1206 dirp->next_offset = 2;
1207 else
1208 dirp->next_offset = readdir_offset;
1209
1210 if (dir->is_empty())
1211 close_dir(dir);
1212 }
1213}
1214
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * Decodes the (optional) dentry and target-inode records from the reply,
 * updates/links the corresponding cached Inode/Dentry objects, and returns
 * the target inode (NULL for traceless or already-unsafe replies).  For
 * traceless mutation replies it invalidates the affected cached dentries
 * so that a later lookup refetches them.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already populated the cache for this request
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we learn nothing about the result, so invalidate
    // what this request may have changed
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // the dentry record carries parent-dir stat + dirstat + name + lease
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: when we asked for xattrs the MDS must include them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // negative result: drop any stale dentry->inode link, but keep a
      // (null) dentry around if the MDS issued a lease on the name
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1384
1385// -------
1386
/*
 * Pick the MDS rank to send a request to.
 *
 * Preference order: an explicitly requested resend_mds; the rank owning
 * the dir fragment the request's dentry hashes into (when known via
 * fragmap); the rank holding caps on the request's inode; otherwise a
 * random up MDS.  If the choice came from a fragmap hash, *phash_diri is
 * set to the directory inode so the caller can prune a stopped rank.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  // a forward/retry may have pinned a specific rank already
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  // derive a dentry-name hash from either the request inode + first path
  // component, or the request dentry itself
  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes have no caps of their own; walk up to a live parent
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dn_set.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;  // the hash belonged to the snapped inode's dir
    }

    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    // if we know which rank owns the fragment this name hashes into, use it
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
        ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
        goto out;
      }
    }

    // otherwise pick the rank we hold caps from (auth cap preferred when
    // the request needs the auth MDS)
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1486
1487
1488void Client::connect_mds_targets(mds_rank_t mds)
1489{
1490 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1491 assert(mds_sessions.count(mds));
1492 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1493 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1494 q != info.export_targets.end();
1495 ++q) {
1496 if (mds_sessions.count(*q) == 0 &&
1497 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1498 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1499 << " export target mds." << *q << dendl;
1500 _open_mds_session(*q);
1501 }
1502 }
1503}
1504
1505void Client::dump_mds_sessions(Formatter *f)
1506{
1507 f->dump_int("id", get_nodeid().v);
1508 f->open_array_section("sessions");
1509 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1510 f->open_object_section("session");
1511 p->second->dump(f);
1512 f->close_section();
1513 }
1514 f->close_section();
1515 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1516}
1517void Client::dump_mds_requests(Formatter *f)
1518{
1519 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1520 p != mds_requests.end();
1521 ++p) {
1522 f->open_object_section("request");
1523 p->second->dump(f);
1524 f->close_section();
1525 }
1526}
1527
/*
 * After a successful request, work out the target inode for the caller.
 *
 * A create reply may carry the new inode number in the extra bufferlist;
 * that also tells us whether *we* won the create race (*pcreated).  If the
 * reply had no trace (request->target unset) we recover the target with a
 * follow-up lookup/getattr, and return -EINTR if the inode we find does
 * not match the one the create reported.
 */
int Client::verify_reply_trace(int r,
                               MetaRequest *request, MClientReply *reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // insert_trace already resolved the target for us
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      // traceless, but the created inode is already in our cache
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          assert(0 == "how did this happen? i want logs!");
        }
      } else {
        // no dentry to look up by; refresh the request inode instead
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          // someone else replaced the name between our create and lookup
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1600
1601
1602/**
1603 * make a request
1604 *
1605 * Blocking helper to make an MDS request.
1606 *
1607 * If the ptarget flag is set, behavior changes slightly: the caller
1608 * expects to get a pointer to the inode we are creating or operating
1609 * on. As a result, we will follow up any traceless mutation reply
1610 * with a getattr or lookup to transparently handle a traceless reply
1611 * from the MDS (as when the MDS restarts and the client has to replay
1612 * a request).
1613 *
1614 * @param request the MetaRequest to execute
1615 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1616 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1617 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1618 * @param use_mds [optional] prefer a specific mds (-1 for default)
1619 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1620 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  // Blocking send-and-wait for a single MDS request; see the doxygen
  // comment above for the parameter contract.  Called with client_lock
  // held (caller_cond.Wait() below releases it while sleeping).
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK requests can block indefinitely, so they are excluded
  // from the oldest_tid tracking reported to the MDS
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // send (and possibly resend) until we get a reply or the request aborts
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        // the chosen rank no longer exists; drop the stale hint and retry
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&          // reply
           request->resend_mds < 0 &&  // forward
           !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // we only get here via request->abort(); surface the abort code
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);  // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  // resolve the target inode (possibly via a follow-up lookup/getattr)
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
1768
1769void Client::unregister_request(MetaRequest *req)
1770{
1771 mds_requests.erase(req->tid);
1772 if (req->tid == oldest_tid) {
1773 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1774 while (true) {
1775 if (p == mds_requests.end()) {
1776 oldest_tid = 0;
1777 break;
1778 }
1779 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1780 oldest_tid = p->first;
1781 break;
1782 }
1783 ++p;
1784 }
1785 }
1786 put_request(req);
1787}
1788
1789void Client::put_request(MetaRequest *request)
1790{
1791 if (request->_put()) {
1792 int op = -1;
1793 if (request->success)
1794 op = request->get_op();
1795 InodeRef other_in;
1796 request->take_other_inode(&other_in);
1797 delete request;
1798
1799 if (other_in &&
1800 (op == CEPH_MDS_OP_RMDIR ||
1801 op == CEPH_MDS_OP_RENAME ||
1802 op == CEPH_MDS_OP_RMSNAP)) {
1803 _try_to_trim_inode(other_in.get(), false);
1804 }
1805 }
1806}
1807
1808int Client::encode_inode_release(Inode *in, MetaRequest *req,
1809 mds_rank_t mds, int drop,
1810 int unless, int force)
1811{
1812 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1813 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1814 << ", have:" << ", force:" << force << ")" << dendl;
1815 int released = 0;
1816 if (in->caps.count(mds)) {
1817 Cap *caps = in->caps[mds];
1818 drop &= ~(in->dirty_caps | get_caps_used(in));
1819 if ((drop & caps->issued) &&
1820 !(unless & caps->issued)) {
1821 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1822 caps->issued &= ~drop;
1823 caps->implemented &= ~drop;
1824 released = 1;
1825 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1826 } else {
1827 released = force;
1828 }
1829 if (released) {
1830 ceph_mds_request_release rel;
1831 rel.ino = in->ino;
1832 rel.cap_id = caps->cap_id;
1833 rel.seq = caps->seq;
1834 rel.issue_seq = caps->issue_seq;
1835 rel.mseq = caps->mseq;
1836 rel.caps = caps->implemented;
1837 rel.wanted = caps->wanted;
1838 rel.dname_len = 0;
1839 rel.dname_seq = 0;
1840 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1841 }
1842 }
1843 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1844 << released << dendl;
1845 return released;
1846}
1847
1848void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1849 mds_rank_t mds, int drop, int unless)
1850{
1851 ldout(cct, 20) << "encode_dentry_release enter(dn:"
1852 << dn << ")" << dendl;
1853 int released = 0;
1854 if (dn->dir)
1855 released = encode_inode_release(dn->dir->parent_inode, req,
1856 mds, drop, unless, 1);
1857 if (released && dn->lease_mds == mds) {
1858 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1859 MClientRequest::Release& rel = req->cap_releases.back();
1860 rel.item.dname_len = dn->name.length();
1861 rel.item.dname_seq = dn->lease_seq;
1862 rel.dname = dn->name;
1863 }
1864 ldout(cct, 25) << "encode_dentry_release exit(dn:"
1865 << dn << ")" << dendl;
1866}
1867
1868
1869/*
1870 * This requires the MClientRequest *request member to be set.
1871 * It will error out horribly without one.
1872 * Additionally, if you set any *drop member, you'd better have
1873 * set the corresponding dentry!
1874 */
1875void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1876{
1877 ldout(cct, 20) << "encode_cap_releases enter (req: "
1878 << req << ", mds: " << mds << ")" << dendl;
1879 if (req->inode_drop && req->inode())
1880 encode_inode_release(req->inode(), req,
1881 mds, req->inode_drop,
1882 req->inode_unless);
1883
1884 if (req->old_inode_drop && req->old_inode())
1885 encode_inode_release(req->old_inode(), req,
1886 mds, req->old_inode_drop,
1887 req->old_inode_unless);
1888 if (req->other_inode_drop && req->other_inode())
1889 encode_inode_release(req->other_inode(), req,
1890 mds, req->other_inode_drop,
1891 req->other_inode_unless);
1892
1893 if (req->dentry_drop && req->dentry())
1894 encode_dentry_release(req->dentry(), req,
1895 mds, req->dentry_drop,
1896 req->dentry_unless);
1897
1898 if (req->old_dentry_drop && req->old_dentry())
1899 encode_dentry_release(req->old_dentry(), req,
1900 mds, req->old_dentry_drop,
1901 req->old_dentry_unless);
1902 ldout(cct, 25) << "encode_cap_releases exit (req: "
1903 << req << ", mds " << mds <<dendl;
1904}
1905
1906bool Client::have_open_session(mds_rank_t mds)
1907{
1908 return
1909 mds_sessions.count(mds) &&
1910 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1911 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1912}
1913
1914MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1915{
1916 if (mds_sessions.count(mds) == 0)
1917 return NULL;
1918 MetaSession *s = mds_sessions[mds];
1919 if (s->con != con)
1920 return NULL;
1921 return s;
1922}
1923
1924MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1925{
1926 if (mds_sessions.count(mds))
1927 return mds_sessions[mds];
1928 return _open_mds_session(mds);
1929}
1930
1931/**
1932 * Populate a map of strings with client-identifying metadata,
1933 * such as the hostname. Call this once at initialization.
1934 */
1935void Client::populate_metadata(const std::string &mount_root)
1936{
1937 // Hostname
1938 struct utsname u;
1939 int r = uname(&u);
1940 if (r >= 0) {
1941 metadata["hostname"] = u.nodename;
1942 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1943 } else {
1944 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1945 }
1946
1947 metadata["pid"] = stringify(getpid());
1948
1949 // Ceph entity id (the '0' in "client.0")
1950 metadata["entity_id"] = cct->_conf->name.get_id();
1951
1952 // Our mount position
1953 if (!mount_root.empty()) {
1954 metadata["root"] = mount_root;
1955 }
1956
1957 // Ceph version
1958 metadata["ceph_version"] = pretty_version_to_str();
1959 metadata["ceph_sha1"] = git_version_to_str();
1960
1961 // Apply any metadata from the user's configured overrides
1962 std::vector<std::string> tokens;
1963 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1964 for (const auto &i : tokens) {
1965 auto eqpos = i.find("=");
1966 // Throw out anything that isn't of the form "<str>=<str>"
1967 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1968 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1969 continue;
1970 }
1971 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1972 }
1973}
1974
1975/**
1976 * Optionally add or override client metadata fields.
1977 */
1978void Client::update_metadata(std::string const &k, std::string const &v)
1979{
1980 Mutex::Locker l(client_lock);
1981 assert(initialized);
1982
1983 if (metadata.count(k)) {
1984 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
1985 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
1986 }
1987
1988 metadata[k] = v;
1989}
1990
1991MetaSession *Client::_open_mds_session(mds_rank_t mds)
1992{
1993 ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
1994 assert(mds_sessions.count(mds) == 0);
1995 MetaSession *session = new MetaSession;
1996 session->mds_num = mds;
1997 session->seq = 0;
1998 session->inst = mdsmap->get_inst(mds);
1999 session->con = messenger->get_connection(session->inst);
2000 session->state = MetaSession::STATE_OPENING;
2001 session->mds_state = MDSMap::STATE_NULL;
2002 mds_sessions[mds] = session;
2003
2004 // Maybe skip sending a request to open if this MDS daemon
2005 // has previously sent us a REJECT.
2006 if (rejected_by_mds.count(mds)) {
2007 if (rejected_by_mds[mds] == session->inst) {
2008 ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
2009 "because we were rejected" << dendl;
2010 return session;
2011 } else {
2012 ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
2013 "rejected us, trying with new inst" << dendl;
2014 rejected_by_mds.erase(mds);
2015 }
2016 }
2017
2018 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
2019 m->client_meta = metadata;
2020 session->con->send_message(m);
2021 return session;
2022}
2023
2024void Client::_close_mds_session(MetaSession *s)
2025{
2026 ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
2027 s->state = MetaSession::STATE_CLOSING;
2028 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2029}
2030
// Tear down a session that is now closed.  Order matters here: wake
// waiters and release session-held state before the record is erased
// and freed.
void Client::_closed_mds_session(MetaSession *s)
{
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();                     // drop the connection
  signal_context_list(s->waiting_for_open); // wake anyone waiting for open
  mount_cond.Signal();                     // (un)mount may be waiting on session state
  remove_session_caps(s);                  // release caps issued via this session
  kick_requests_closed(s);                 // re-drive requests that were on this session
  mds_sessions.erase(s->mds_num);
  delete s;
}
2042
// Dispatch an incoming MClientSession message from an MDS.  Messages
// from connections we no longer have a session record for are dropped.
void Client::handle_client_session(MClientSession *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    // session established: renew caps, mark OPEN, and wake waiters
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only extend the cap TTL if this ack matches our latest renew request
    if (session->cap_renew_seq == m->get_seq()) {
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      wake_inode_waiters(session);
    }
    break;

  case CEPH_SESSION_STALE:
    // MDS considers us stale; try to renew caps
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    // MDS asks us to shrink our cap count to the given maximum
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    // remember which instance rejected us so we don't immediately retry it
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);

    break;

  default:
    ceph_abort();
  }

  m->put();
}
2106
2107bool Client::_any_stale_sessions() const
2108{
2109 assert(client_lock.is_locked_by_me());
2110
2111 for (const auto &i : mds_sessions) {
2112 if (i.second->state == MetaSession::STATE_STALE) {
2113 return true;
2114 }
2115 }
2116
2117 return false;
2118}
2119
2120void Client::_kick_stale_sessions()
2121{
2122 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2123
2124 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2125 p != mds_sessions.end(); ) {
2126 MetaSession *s = p->second;
2127 ++p;
2128 if (s->state == MetaSession::STATE_STALE)
2129 _closed_mds_session(s);
2130 }
2131}
2132
// (Re)build and transmit a MetaRequest to the given session's MDS.
// drop_cap_releases is set when our cap reconnect has not been sent yet,
// in which case queued cap releases must not ride along with the request.
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // replay of an op that already received an unsafe reply
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may refer to pools; give the MDS our current osdmap epoch
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // first transmission of this request
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap mseq so a later ESTALE reply can tell whether our
  // caps changed since the request was sent
  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2177
// Construct the wire-format MClientRequest for a MetaRequest, deriving the
// filepath(s) from the request's inode/dentry if they were not explicitly
// set.  Also bumps the request's retry_attempt counter, so each (re)send
// carries a distinct attempt number.
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// path of the parent dir plus the dentry's own name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		 << " No path, inode, or dentry given!"
		 << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2215
2216
2217
2218void Client::handle_client_request_forward(MClientRequestForward *fwd)
2219{
2220 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2221 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2222 if (!session) {
2223 fwd->put();
2224 return;
2225 }
2226 ceph_tid_t tid = fwd->get_tid();
2227
2228 if (mds_requests.count(tid) == 0) {
2229 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2230 fwd->put();
2231 return;
2232 }
2233
2234 MetaRequest *request = mds_requests[tid];
2235 assert(request);
2236
2237 // reset retry counter
2238 request->retry_attempt = 0;
2239
2240 // request not forwarded, or dest mds has no session.
2241 // resend.
2242 ldout(cct, 10) << "handle_client_request tid " << tid
2243 << " fwd " << fwd->get_num_fwd()
2244 << " to mds." << fwd->get_dest_mds()
2245 << ", resending to " << fwd->get_dest_mds()
2246 << dendl;
2247
2248 request->mds = -1;
2249 request->item.remove_myself();
2250 request->num_fwd = fwd->get_num_fwd();
2251 request->resend_mds = fwd->get_dest_mds();
2252 request->caller_cond->Signal();
2253
2254 fwd->put();
2255}
2256
2257bool Client::is_dir_operation(MetaRequest *req)
2258{
2259 int op = req->get_op();
2260 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2261 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2262 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2263 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2264 return true;
2265 return false;
2266}
2267
// Handle an MClientReply from an MDS.  A request may receive two replies:
// an early "unsafe" one (applied but not yet journaled) and a later "safe"
// one (committed).  On the first reply we wake the waiting caller thread
// and hand-shake via dispatch_cond so the caller processes the trace
// before the dispatcher continues; on the safe reply we drop all unsafe
// bookkeeping and unregister the request.
void Client::handle_client_reply(MClientReply *reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // Only surface ESTALE to the caller if retrying is pointless: we would
    // resend to the same MDS and our caps there have not changed (same
    // mseq) since the request was sent.
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      request->caller_cond->Signal();  // caller will retry elsewhere
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }

  assert(request->reply == NULL);
  request->reply = reply;
  insert_trace(request, session);  // apply the reply's metadata trace to our cache

  // Handle unsafe reply
  if (!is_safe) {
    // track the request on session/inode lists until the safe reply arrives
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wake for kick back
    // (the caller clears request->dispatch_cond when it is done with the reply)
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();  // unmount may be waiting for requests to drain
}
2370
2371void Client::_handle_full_flag(int64_t pool)
2372{
2373 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2374 << "on " << pool << dendl;
2375 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2376 // to do this rather than blocking, because otherwise when we fill up we
2377 // potentially lock caps forever on files with dirty pages, and we need
2378 // to be able to release those caps to the MDS so that it can delete files
2379 // and free up space.
2380 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2381
2382 // For all inodes with layouts in this pool and a pending flush write op
2383 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2384 // from ObjectCacher so that it doesn't re-issue the write in response to
2385 // the ENOSPC error.
2386 // Fortunately since we're cancelling everything in a given pool, we don't
2387 // need to know which ops belong to which ObjectSet, we can just blow all
2388 // the un-flushed cached data away and mark any dirty inodes' async_err
2389 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2390 // affecting this pool, and all the objectsets we're purging were also
2391 // in this pool.
2392 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2393 i != inode_map.end(); ++i)
2394 {
2395 Inode *inode = i->second;
2396 if (inode->oset.dirty_or_tx
2397 && (pool == -1 || inode->layout.pool_id == pool)) {
2398 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2399 << " has dirty objects, purging and setting ENOSPC" << dendl;
2400 objectcacher->purge_set(&inode->oset);
2401 inode->set_async_err(-ENOSPC);
2402 }
2403 }
2404
2405 if (cancelled_epoch != (epoch_t)-1) {
2406 set_cap_epoch_barrier(cancelled_epoch);
2407 }
2408}
2409
// Handle a new OSDMap: detect whether this client has been blacklisted
// (failing all outstanding work if so), and cancel writes to FULL pools
// with -ENOSPC.  Always consumes the message.
void Client::handle_osd_map(MOSDMap *m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddr = messenger->get_myaddr();
  if (!blacklisted && new_blacklists.count(myaddr)) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
        });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;
    // Abort every in-flight MDS request and wake any caller blocked on it.
    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
         p != mds_requests.end(); ) {
      auto req = p->second;
      ++p;
      req->abort(-EBLACKLISTED);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
    }

    // Progress aborts on any requests that were on this waitlist.  Any
    // requests that were on a waiting_for_open session waitlist
    // will get kicked during close session below.
    signal_cond_list(waiting_for_mdsmap);

    // Force-close all sessions: assume this is not abandoning any state
    // on the MDS side because the MDS will have seen the blacklist too.
    while(!mds_sessions.empty()) {
      auto i = mds_sessions.begin();
      auto session = i->second;
      _closed_mds_session(session);
    }

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
        return o.is_blacklisted(myaddr);});
  }

  if (objecter->osdmap_full_flag()) {
    // cluster-wide full flag covers every pool
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }

  m->put();
}
2487
2488
2489// ------------------------
2490// incoming messages
2491
2492
2493bool Client::ms_dispatch(Message *m)
2494{
2495 Mutex::Locker l(client_lock);
2496 if (!initialized) {
2497 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2498 m->put();
2499 return true;
2500 }
2501
2502 switch (m->get_type()) {
2503 // mounting and mds sessions
2504 case CEPH_MSG_MDS_MAP:
2505 handle_mds_map(static_cast<MMDSMap*>(m));
2506 break;
2507 case CEPH_MSG_FS_MAP:
2508 handle_fs_map(static_cast<MFSMap*>(m));
2509 break;
2510 case CEPH_MSG_FS_MAP_USER:
2511 handle_fs_map_user(static_cast<MFSMapUser*>(m));
2512 break;
2513 case CEPH_MSG_CLIENT_SESSION:
2514 handle_client_session(static_cast<MClientSession*>(m));
2515 break;
2516
2517 case CEPH_MSG_OSD_MAP:
2518 handle_osd_map(static_cast<MOSDMap*>(m));
2519 break;
2520
2521 // requests
2522 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2523 handle_client_request_forward(static_cast<MClientRequestForward*>(m));
2524 break;
2525 case CEPH_MSG_CLIENT_REPLY:
2526 handle_client_reply(static_cast<MClientReply*>(m));
2527 break;
2528
2529 case CEPH_MSG_CLIENT_SNAP:
2530 handle_snap(static_cast<MClientSnap*>(m));
2531 break;
2532 case CEPH_MSG_CLIENT_CAPS:
2533 handle_caps(static_cast<MClientCaps*>(m));
2534 break;
2535 case CEPH_MSG_CLIENT_LEASE:
2536 handle_lease(static_cast<MClientLease*>(m));
2537 break;
2538 case MSG_COMMAND_REPLY:
2539 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2540 handle_command_reply(static_cast<MCommandReply*>(m));
2541 } else {
2542 return false;
2543 }
2544 break;
2545 case CEPH_MSG_CLIENT_QUOTA:
2546 handle_quota(static_cast<MClientQuota*>(m));
2547 break;
2548
2549 default:
2550 return false;
2551 }
2552
2553 // unmounting?
2554 if (unmounting) {
2555 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2556 << "+" << inode_map.size() << dendl;
2557 long unsigned size = lru.lru_get_size() + inode_map.size();
2558 trim_cache();
2559 if (size < lru.lru_get_size() + inode_map.size()) {
2560 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2561 mount_cond.Signal();
2562 } else {
2563 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2564 << "+" << inode_map.size() << dendl;
2565 }
2566 }
2567
2568 return true;
2569}
2570
2571void Client::handle_fs_map(MFSMap *m)
2572{
2573 fsmap.reset(new FSMap(m->get_fsmap()));
2574 m->put();
2575
2576 signal_cond_list(waiting_for_fsmap);
2577
2578 monclient->sub_got("fsmap", fsmap->get_epoch());
2579}
2580
2581void Client::handle_fs_map_user(MFSMapUser *m)
2582{
2583 fsmap_user.reset(new FSMapUser);
2584 *fsmap_user = m->get_fsmap();
2585 m->put();
2586
2587 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2588 signal_cond_list(waiting_for_fsmap);
2589}
2590
// Handle a new MDSMap: cancel admin commands aimed at MDS daemons that
// went away, reconcile every session with the new per-rank state (mark
// connections down, trigger reconnect, kick requests on newly-active
// ranks), and wake threads waiting for a map update.
void Client::handle_mds_map(MMDSMap* m)
{
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    // stale map; nothing to do
    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    m->put();
    return;
  }

  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;

  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = p->second;
    ++p;  // advance first: _closed_mds_session() below erases the entry

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_inst(mds) != session->inst) {
      // rank restarted at a different address; drop the stale connection
      session->con->mark_down();
      session->inst = mdsmap->get_inst(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->get_connection(session->inst);
      send_reconnect(session);
    } else if (newstate >= MDSMap::STATE_ACTIVE) {
      if (oldstate < MDSMap::STATE_ACTIVE) {
	// kick new requests
	kick_requests(session);
	kick_flushing_caps(session);
	signal_context_list(session->waiting_for_open);
	kick_maxsize_requests(session);
	wake_inode_waiters(session);
      }
      connect_mds_targets(mds);
    } else if (newstate == MDSMap::STATE_NULL &&
	       mds >= mdsmap->get_max_mds()) {
      // the rank disappeared entirely (e.g. max_mds was reduced)
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  m->put();

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2682
// Send an MClientReconnect describing every cap we hold to an MDS that has
// entered reconnect state, so the restarted/replacement MDS can rebuild
// its session and cap state for this client.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  if (session->release) {
    // discard queued cap releases; the new MDS won't know about them
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
	       << " " << ccap_string(in->caps[mds]->issued)
	       << " wants " << ccap_string(in->caps_wanted())
	       << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      // report everything implemented as issued; the MDS reconciles this
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // describe each snaprealm only once
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  // re-send caps we were mid-flush on before the MDS went away
  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
2758
2759
2760void Client::kick_requests(MetaSession *session)
2761{
2762 ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2763 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2764 p != mds_requests.end();
2765 ++p) {
31f18b77
FG
2766 MetaRequest *req = p->second;
2767 if (req->got_unsafe)
2768 continue;
2769 if (req->aborted()) {
2770 if (req->caller_cond) {
2771 req->kick = true;
2772 req->caller_cond->Signal();
2773 }
7c673cae 2774 continue;
31f18b77
FG
2775 }
2776 if (req->retry_attempt > 0)
7c673cae 2777 continue; // new requests only
31f18b77 2778 if (req->mds == session->mds_num) {
7c673cae
FG
2779 send_request(p->second, session);
2780 }
2781 }
2782}
2783
// Re-send requests to an MDS entering reconnect: first every request that
// already got an unsafe reply (so its effects are not lost), then older
// already-sent requests so the MDS can handle completed ones during its
// clientreplay stage.
void Client::resend_unsafe_requests(MetaSession *session)
{
  for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
       !iter.end();
       ++iter)
    send_request(*iter, session);

  // also re-send old requests when MDS enters reconnect stage. So that MDS can
  // process completed requests in clientreplay stage.
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;  // already covered by the loop above
    if (req->aborted())
      continue;
    if (req->retry_attempt == 0)
      continue; // old requests only
    if (req->mds == session->mds_num)
      // drop cap releases: our cap reconnect has not been sent yet
      send_request(req, session, true);
  }
}
2807
2808void Client::wait_unsafe_requests()
2809{
2810 list<MetaRequest*> last_unsafe_reqs;
2811 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2812 p != mds_sessions.end();
2813 ++p) {
2814 MetaSession *s = p->second;
2815 if (!s->unsafe_requests.empty()) {
2816 MetaRequest *req = s->unsafe_requests.back();
2817 req->get();
2818 last_unsafe_reqs.push_back(req);
2819 }
2820 }
2821
2822 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2823 p != last_unsafe_reqs.end();
2824 ++p) {
2825 MetaRequest *req = *p;
2826 if (req->unsafe_item.is_on_list())
2827 wait_on_list(req->waitfor_safe);
2828 put_request(req);
2829 }
2830}
2831
// A session was closed: wake every caller with a request targeted at that
// MDS (so it can abort or retarget), and drop all per-session request
// bookkeeping, including unsafe requests that will now never be committed.
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;  // advance first: unregister_request() below may erase this entry
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	// this MDS will never commit the unsafe reply now
	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  // nothing may remain queued on the closed session
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
2858
2859
2860
2861
2862/************
2863 * leases
2864 */
2865
2866void Client::got_mds_push(MetaSession *s)
2867{
2868 s->seq++;
2869 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2870 if (s->state == MetaSession::STATE_CLOSING) {
2871 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2872 }
2873}
2874
// Handle a lease revocation from an MDS: invalidate our cached dentry
// lease if we still hold it, and always acknowledge with a RELEASE so the
// MDS can make progress.  Always consumes the message.
void Client::handle_lease(MClientLease *m)
{
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  // revoke is the only lease action the MDS sends to clients
  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    // we no longer cache it, but we still must ack the revoke
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // lease no longer valid
  }

 revoke:
  // unconditionally tell the MDS we released the lease
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2917
// Drop n references on an inode; when the last reference goes away,
// release its caps, verify no dirty cached data remains, remove it from
// the inode map, and delete the in-memory inode.
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << "put_inode on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    // all cached data for this inode must already be clean by now
    bool unclean = objectcacher->release_set(&in->oset);
    assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      // dropping the root inode: clear root bookkeeping as well
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
2943
2944void Client::close_dir(Dir *dir)
2945{
2946 Inode *in = dir->parent_inode;
2947 ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
2948 assert(dir->is_empty());
2949 assert(in->dir == dir);
2950 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
2951 if (!in->dn_set.empty())
2952 in->get_first_parent()->put(); // unpin dentry
2953
2954 delete in->dir;
2955 in->dir = 0;
2956 put_inode(in); // unpin inode
2957}
2958
 /**
  * Link a dentry into a Dir (creating the Dentry if dn == NULL) and
  * optionally attach an inode to it, maintaining the dentry pin counts
  * and the single-parent invariant for directories.
  *
  * Don't call this with in==NULL, use get_or_create for that
  * leave dn set to default NULL unless you're trying to add
  * a new inode to a pre-created Dentry
  */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;

    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn); // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    if (in->is_dir()) {
      // extra pins while the dir's Dir object or an ll ref exists
      if (in->dir)
	dn->get(); // dir -> dn pin
      if (in->ll_ref)
	dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // the old parent's contents changed; invalidate its completeness
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link inode " << in << " parents now " << in->dn_set << dendl;
  }

  return dn;
}
3011
3012void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3013{
3014 InodeRef in;
3015 in.swap(dn->inode);
3016 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3017 << " inode " << dn->inode << dendl;
3018
3019 // unlink from inode
3020 if (in) {
3021 if (in->is_dir()) {
3022 if (in->dir)
3023 dn->put(); // dir -> dn pin
3024 if (in->ll_ref)
3025 dn->put(); // ll_ref -> dn pin
3026 }
3027 dn->inode = 0;
3028 assert(in->dn_set.count(dn));
3029 in->dn_set.erase(dn);
3030 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3031 }
3032
3033 if (keepdentry) {
3034 dn->lease_mds = -1;
3035 } else {
3036 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3037
3038 // unlink from dir
3039 dn->dir->dentries.erase(dn->name);
3040 if (dn->dir->is_empty() && !keepdir)
3041 close_dir(dn->dir);
3042 dn->dir = 0;
3043
3044 // delete den
3045 lru.lru_remove(dn);
3046 dn->put();
3047 }
3048}
3049
3050/**
3051 * For asynchronous flushes, check for errors from the IO and
3052 * update the inode if necessary
3053 */
3054class C_Client_FlushComplete : public Context {
3055private:
3056 Client *client;
3057 InodeRef inode;
3058public:
3059 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3060 void finish(int r) override {
3061 assert(client->client_lock.is_locked_by_me());
3062 if (r != 0) {
3063 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3064 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3065 << " 0x" << std::hex << inode->ino << std::dec
3066 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3067 inode->set_async_err(r);
3068 }
3069 }
3070};
3071
3072
3073/****
3074 * caps
3075 */
3076
3077void Client::get_cap_ref(Inode *in, int cap)
3078{
3079 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3080 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3081 ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3082 in->get();
3083 }
3084 if ((cap & CEPH_CAP_FILE_CACHE) &&
3085 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3086 ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3087 in->get();
3088 }
3089 in->get_cap_ref(cap);
3090}
3091
// Drop cap references on an inode.  When the last reference of a cap bit
// is dropped, this may finish a pending cap_snap, wake blocked writers and
// commit waiters, release the inode pins taken in get_cap_ref(), and check
// whether caps should be returned to the MDS.
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // cap bits we just released that are no longer issued at all
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;  // matches the in->get() taken in get_cap_ref()
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;  // matches the in->get() taken in get_cap_ref()
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3125
// Acquire capability references for a file operation, blocking until the
// required caps are issued and usable.
//
// @param in inode to get caps for
// @param need cap bits the caller cannot proceed without
// @param want cap bits the caller can use opportunistically
// @param phave out: need plus whatever subset of want was granted
// @param endoff for writes, the end offset of the I/O (used to request a
//        larger max_size from the MDS); <= 0 when not applicable
// @return 0 on success (a cap ref is taken on 'need'; release with
//         put_cap_ref), -EBADF if the open file modes no longer want
//         'need', -EROFS for writes on a read-only session, or an error
//         from check_pool_perm()/_renew_caps()
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      // the caps wanted by open file modes no longer cover 'need'
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // ask the MDS for a larger max_size before the write reaches the
      // current limit (or passes twice the current size)
      if (endoff > 0 &&
	  (endoff >= (loff_t)in->max_size ||
	   endoff > (loff_t)(in->size << 1)) &&
	  endoff > (loff_t)in->wanted_max_size) {
	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	in->wanted_max_size = endoff;
	check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      // writes must wait for any cap_snap that is still being written or
      // still has dirty buffered data
      if (!in->cap_snaps.empty()) {
	if (in->cap_snaps.rbegin()->second.writing) {
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  // kick off writeback so the dirty cap_snap data gets committed
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	// only grant 'want' bits that the MDS is not in the middle of revoking
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		 << " need " << ccap_string(need) << " want " << ccap_string(want)
		 << " revoking " << ccap_string(revoking)
		 << dendl;
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // our caps were dropped; if the MDS no longer knows we want 'need',
      // explicitly re-request before waiting
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if ((mds_wanted & file_wanted) ==
	  (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
	in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3220
3221int Client::get_caps_used(Inode *in)
3222{
3223 unsigned used = in->caps_used();
3224 if (!(used & CEPH_CAP_FILE_CACHE) &&
3225 !objectcacher->set_is_empty(&in->oset))
3226 used |= CEPH_CAP_FILE_CACHE;
3227 return used;
3228}
3229
3230void Client::cap_delay_requeue(Inode *in)
3231{
3232 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3233 in->hold_caps_until = ceph_clock_now();
3234 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3235 delayed_caps.push_back(&in->cap_item);
3236}
3237
// Send a CEPH_CAP_OP_UPDATE message for one cap to its MDS, carrying the
// current inode metadata, the cap bits we keep ('retain'), the bits we
// want, and any dirty bits being flushed ('flush' under 'flush_tid').
// Updates the cap's issued/implemented state to reflect what we give back.
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      bool sync, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;   // never retain bits the MDS is revoking
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
	   << " mds." << session->mds_num << " seq " << cap->seq
	   << (sync ? " sync " : " async ")
	   << " used " << ccap_string(used)
	   << " want " << ccap_string(want)
	   << " flush " << ccap_string(flush)
	   << " retain " << ccap_string(retain)
	   << " held "<< ccap_string(held)
	   << " revoking " << ccap_string(revoking)
	   << " dropping " << ccap_string(dropping)
	   << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    // test hook: pretend we failed to release revoked caps
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    // - tell the server we think issued is whatever they issued plus whatever we implemented
    // - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
				   in->ino,
				   0,
				   cap->cap_id, cap->seq,
				   cap->implemented,
				   want,
				   flush,
				   cap->mseq,
                                   cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    // dirty xattrs travel with the flush
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth cap negotiates max_size with the MDS
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3347
31f18b77
FG
3348static bool is_max_size_approaching(Inode *in)
3349{
3350 /* mds will adjust max size according to the reported size */
3351 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3352 return false;
3353 if (in->size >= in->max_size)
3354 return true;
3355 /* half of previous max_size increment has been used */
3356 if (in->max_size > in->reported_size &&
3357 (in->size << 1) >= in->max_size + in->reported_size)
3358 return true;
3359 return false;
3360}
7c673cae
FG
3361
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_NODELAY,
 *              CHECK_CAPS_SYNCHRONOUS)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  if (in->is_dir() && (in->flags & I_COMPLETE)) {
    // we do this here because we don't want to drop to Fs (and then
    // drop the Fs if we do a create!) if that alone makes us send lookups
    // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
    wanted |= CEPH_CAP_FILE_EXCL;
  }

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;   // bits the MDS is taking back

  // keep what we want and use; while mounted, retain extra caps too
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting) {
    if (wanted)
      retain |= CEPH_CAP_ANY;
    else
      retain |= CEPH_CAP_ANY_SHARED;
  }

  ldout(cct, 10) << "check_caps on " << *in
	   << " wanted " << ccap_string(wanted)
	   << " used " << ccap_string(used)
	   << " issued " << ccap_string(issued)
	   << " revoking " << ccap_string(revoking)
	   << " flags=" << flags
	   << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return; // guard if at end of func

  // CACHE/LAZYIO being revoked and no buffered writes: drop cached data now
  if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
      (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER))
    _release(in);

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  if (flags & CHECK_CAPS_NODELAY)
    in->hold_caps_until = utime_t();   // act immediately
  else
    cap_delay_requeue(in);

  utime_t now = ceph_clock_now();

  map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
  while (it != in->caps.end()) {
    mds_rank_t mds = it->first;
    Cap *cap = it->second;
    ++it;   // advance before acting on this cap

    MetaSession *session = mds_sessions[mds];
    assert(session);

    // for non-auth caps, don't count usage that the auth cap covers
    cap_used = used;
    if (in->auth_cap && cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap->implemented & ~cap->issued;

    ldout(cct, 10) << " cap mds." << mds
	     << " issued " << ccap_string(cap->issued)
	     << " implemented " << ccap_string(cap->implemented)
	     << " revoking " << ccap_string(revoking) << dendl;

    // need a larger max_size than we have asked the auth MDS for?
    if (in->wanted_max_size > in->max_size &&
	in->wanted_max_size > in->requested_max_size &&
	cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap->issued & CEPH_CAP_FILE_WR) &&
	cap == in->auth_cap &&
	is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
		     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap->wanted | cap->issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if (wanted == cap->wanted &&         // mds knows what we want.
	((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
	!in->dirty_caps)                 // and we have no dirty caps
      continue;

    if (now < in->hold_caps_until) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
	session->mds_state < MDSMap::STATE_ACTIVE &&
	session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
		     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
	flush_snaps(in, true);
      if (in->flushing_caps)
	flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    // dirty caps on the auth cap get a flush tid; otherwise plain update
    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
	     retain, flushing, flush_tid);
  }
}
3508
3509
// Capture the inode's dirty state into a CapSnap keyed by the old snap
// context's seq, so the pre-snapshot state can be flushed to the MDS.
// No-op if the latest cap_snap is still being written or nothing is dirty.
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // latest cap_snap still has writers; don't queue another yet
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
	     (dirty & CEPH_CAP_ANY_WR)) {
    // something is dirty or being written: snapshot the current metadata
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    // remember whether buffered data must be written back before flushing
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // writers in flight; put_cap_ref() finishes the cap_snap when the
      // last writer drops its ref
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3549
// Record the inode's size/time attributes into a queued CapSnap once
// writes have stopped, then flush it to the MDS unless buffered data
// still needs writeback (in which case dirty_data stays set and the
// flush happens later, when the buffers are clean).
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // dirty buffers remain; flushing is deferred (see _flushed_cap_snap
    // and put_cap_ref, which clear dirty_data)
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3575
3576void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3577{
3578 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3579 in->cap_snaps.at(seq).dirty_data = 0;
3580 flush_snaps(in);
3581}
3582
// Send CEPH_CAP_OP_FLUSHSNAP messages for the inode's cap_snaps to the
// auth MDS. Snaps still being written or with dirty buffered data are
// skipped. With all_again set, already-flushed snaps are re-sent (used
// by check_caps when reflushing to a reconnecting MDS).
void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  assert(in->cap_snaps.size());

  // pick auth mds
  assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
	continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    if (capsnap.dirty_data || capsnap.writing)
      continue;   // not ready to flush yet

    if (capsnap.flush_tid == 0) {
      // first flush of this snap: allocate a tid and track it on the session
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
	session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
				     cap_epoch_barrier);
    if (user_id >= 0)
      m->caller_uid = user_id;
    if (group_id >= 0)
      m->caller_gid = group_id;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    // metadata captured at snapshot time, not the inode's current state
    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    ::encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message(m);
  }
}
3659
3660
3661
// Park the calling thread on the given waiter list until someone calls
// signal_cond_list() on it. client_lock must be held; Wait() atomically
// drops and re-takes it.
void Client::wait_on_list(list<Cond*>& ls)
{
  Cond cond;
  ls.push_back(&cond);
  cond.Wait(client_lock);
  ls.remove(&cond);   // de-register ourselves after waking
}
3669
3670void Client::signal_cond_list(list<Cond*>& ls)
3671{
3672 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3673 (*it)->Signal();
3674}
3675
// Park the caller until signal_context_list() completes the context we
// queue here. client_lock must be held; it is released while waiting.
void Client::wait_on_context_list(list<Context*>& ls)
{
  Cond cond;
  bool done = false;
  int r;
  // C_Cond sets 'done' and signals 'cond' when completed; completion
  // (and deletion) happens in signal_context_list -> complete().
  ls.push_back(new C_Cond(&cond, &done, &r));
  while (!done)
    cond.Wait(client_lock);
}
3685
3686void Client::signal_context_list(list<Context*>& ls)
3687{
3688 while (!ls.empty()) {
3689 ls.front()->complete(0);
3690 ls.pop_front();
3691 }
3692}
3693
3694void Client::wake_inode_waiters(MetaSession *s)
3695{
3696 xlist<Cap*>::iterator iter = s->caps.begin();
3697 while (!iter.end()){
3698 signal_cond_list((*iter)->inode->waitfor_caps);
3699 ++iter;
3700 }
3701}
3702
3703
3704// flush dirty data (from objectcache)
3705
// Context that invokes the client's cache-invalidation callback for an
// inode range; runs on the async invalidator thread, outside client_lock.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    // capture the (possibly faked) vinodeno now, so the context does not
    // need to hold on to the Inode itself
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3725
// Invoke the registered inode-invalidate callback for [off, off+len).
// Runs on the async invalidator thread; skipped during unmount.
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
3733
// Queue an asynchronous invalidation of [off, off+len) for the inode,
// if an invalidate callback has been registered.
void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {

  if (ino_invalidate_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
}
3740
// Drop all cached data for the inode: release its object-cacher set and
// schedule a full (0~0) invalidation callback.
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc)
    objectcacher->release_set(&in->oset);

  _schedule_invalidate_callback(in, 0, 0);
}
3751
// Drop cached data for a byte range of the inode: discard the matching
// object extents from the object cacher and schedule the invalidation
// callback for the same range.
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // map the file range onto object extents per the inode's layout
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_set(&in->oset, ls);
  }

  _schedule_invalidate_callback(in, off, len);
}
3765
3766bool Client::_release(Inode *in)
3767{
3768 ldout(cct, 20) << "_release " << *in << dendl;
3769 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3770 _invalidate_inode_cache(in);
3771 return true;
3772 }
3773 return false;
3774}
3775
3776bool Client::_flush(Inode *in, Context *onfinish)
3777{
3778 ldout(cct, 10) << "_flush " << *in << dendl;
3779
3780 if (!in->oset.dirty_or_tx) {
3781 ldout(cct, 10) << " nothing to flush" << dendl;
3782 onfinish->complete(0);
3783 return true;
3784 }
3785
3786 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3787 ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3788 objectcacher->purge_set(&in->oset);
3789 if (onfinish) {
3790 onfinish->complete(-ENOSPC);
3791 }
3792 return true;
3793 }
3794
3795 return objectcacher->flush_set(&in->oset, onfinish);
3796}
3797
// Synchronously flush buffered data in [offset, offset+size) via the
// object cacher. client_lock must be held on entry; it is dropped while
// waiting for the writeback to complete and re-taken before returning.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // private lock/cond pair signalled by the flush completion
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3822
3823void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3824{
3825 // Mutex::Locker l(client_lock);
3826 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3827 Inode *in = static_cast<Inode *>(oset->parent);
3828 assert(in);
3829 _flushed(in);
3830}
3831
// Writeback of the inode's object set completed: drop the CACHE and
// BUFFER cap refs held while the data was dirty.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3838
3839
3840
3841// checks common to add_update_cap, handle_cap_grant
3842void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3843{
3844 unsigned had = in->caps_issued();
3845
3846 if ((issued & CEPH_CAP_FILE_CACHE) &&
3847 !(had & CEPH_CAP_FILE_CACHE))
3848 in->cache_gen++;
3849
3850 if ((issued & CEPH_CAP_FILE_SHARED) &&
3851 !(had & CEPH_CAP_FILE_SHARED)) {
3852 in->shared_gen++;
3853
3854 if (in->is_dir())
3855 clear_dir_complete_and_ordered(in, true);
3856 }
3857}
3858
// Install a new cap for the inode from the given MDS session, or fold
// newly issued bits into an existing one.
//
// @param in inode the cap is for
// @param mds_session session of the granting MDS
// @param cap_id MDS-assigned cap id
// @param issued newly issued cap bits
// @param seq cap sequence number; mseq migration sequence number
// @param realm snap realm ino (used when this is the inode's first cap)
// @param flags CEPH_CAP_FLAG_* bits (e.g. AUTH)
// @param cap_perms credentials associated with this cap
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
			    int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // first cap from this MDS for this inode
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // first cap overall: attach the inode to its snap realm
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
    cap_list.push_back(&in->cap_item);
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // switch auth cap if this cap carries a newer migration seq
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << "add_update_cap changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued |= issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
	continue;
      if (it->second->implemented & ~it->second->issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);   // new bits may unblock waiters
}
3944
// Tear down a single cap: optionally queue a cap-release message to the
// MDS, detach the cap from its inode and session, and close the inode's
// snap realm if this was its last cap.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    // batch the release onto the session rather than sending right away
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;

  if (!in->is_any_caps()) {
    // last cap gone: detach from and drop the snap realm
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
3983
// Drop every cap on the inode, queueing a release to the MDS for each.
// (Loop re-reads begin() because remove_cap erases the map entry.)
void Client::remove_all_caps(Inode *in)
{
  while (!in->caps.empty())
    remove_cap(in->caps.begin()->second, true);
}
3989
// Remove every cap held through the given session (session teardown).
// Dirty/flushing state that can no longer reach the MDS is discarded
// (with an error logged) and the associated inode refs are dropped;
// inodes are flagged I_CAP_DROPPED so get_caps() can re-request later.
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;
    }
    remove_cap(cap, false);   // don't queue releases; session is going away
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // hold a ref while dropping the cap_snaps
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->dirty_caps = 0;
      put_inode(in);   // drop the ref taken when caps were marked dirty
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4025
b32b8144
FG
4026int Client::_do_remount(void)
4027{
4028 errno = 0;
4029 int r = remount_cb(callback_handle);
4030 if (r != 0) {
4031 int e = errno;
4032 client_t whoami = get_nodeid();
4033 if (r == -1) {
4034 lderr(cct) <<
4035 "failed to remount (to trim kernel dentries): "
4036 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4037 } else {
4038 lderr(cct) <<
4039 "failed to remount (to trim kernel dentries): "
4040 "return code = " << r << dendl;
4041 }
4042 bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_remount") ||
4043 cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
4044 if (should_abort && !unmounting) {
4045 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4046 ceph_abort();
4047 }
4048 }
4049 return r;
4050}
4051
7c673cae
FG
// Finisher context that triggers a remount to force the kernel to trim
// its dentry cache (queued by _invalidate_kernel_dcache).
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    assert(r == 0);
    client->_do_remount();
  }
};
4062
4063void Client::_invalidate_kernel_dcache()
4064{
4065 if (unmounting)
4066 return;
4067 if (can_invalidate_dentries && dentry_invalidate_cb && root->dir) {
4068 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4069 p != root->dir->dentries.end();
4070 ++p) {
4071 if (p->second->inode)
4072 _schedule_invalidate_dentry_callback(p->second, false);
4073 }
4074 } else if (remount_cb) {
4075 // Hacky:
4076 // when remounting a file system, linux kernel trims all unused dentries in the fs
4077 remount_finisher.queue(new C_Client_Remount(this));
4078 }
4079}
4080
// Try to reduce the number of caps held through a session down to 'max'
// (typically in response to MDS pressure). Unused non-auth caps are
// released outright; for other caps we trim expireable dentries so the
// inodes can be dropped. If still over the limit afterwards, ask the
// kernel to shrink its dentry cache.
void Client::trim_caps(MetaSession *s, int max)
{
  mds_rank_t mds = s->mds_num;
  int caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
	   << " caps " << caps_size << dendl;

  int trimmed = 0;
  xlist<Cap*>::iterator p = s->caps.begin();
  std::set<InodeRef> anchor; /* prevent put_inode from deleting all caps during traversal */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	remove_cap(cap, true);
	/* N.B. no need to push onto anchor, as we are only removing one cap */
	trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      while (q != in->dn_set.end()) {
	Dentry *dn = *q++;
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
	  ldout(cct, 20) << " anchoring inode: " << in->ino << dendl;
	  anchor.insert(in);
	  trim_dentry(dn);
	} else {
	  ldout(cct, 20) << " not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      // only count the inode as trimmed if every dentry was expireable
      if (all && in->ino != MDS_INO_ROOT) {
	ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }
  ldout(cct, 20) << " clearing anchored inodes" << dendl;
  anchor.clear();   // now safe to let put_inode run

  caps_size = s->caps.size();
  if (caps_size > max)
    _invalidate_kernel_dcache();
}
4144
4145void Client::force_session_readonly(MetaSession *s)
4146{
4147 s->readonly = true;
4148 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4149 Inode *in = (*p)->inode;
4150 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4151 signal_cond_list(in->waitfor_caps);
4152 }
4153}
4154
// Mark cap bits dirty on the inode. The first dirty bit takes an inode
// reference, dropped once the dirty state is flushed or discarded.
void Client::mark_caps_dirty(Inode *in, int caps)
{
  ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> "
	   << ccap_string(in->dirty_caps | caps) << dendl;
  if (caps && !in->caps_dirty())
    in->get();   // pin the inode while it carries dirty caps
  in->dirty_caps |= caps;
}
4163
4164int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4165{
4166 MetaSession *session = in->auth_cap->session;
4167
4168 int flushing = in->dirty_caps;
4169 assert(flushing);
4170
4171 ceph_tid_t flush_tid = ++last_flush_tid;
4172 in->flushing_cap_tids[flush_tid] = flushing;
4173
4174 if (!in->flushing_caps) {
4175 ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
4176 num_flushing_caps++;
4177 } else {
4178 ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
4179 }
4180
4181 in->flushing_caps |= flushing;
4182 in->dirty_caps = 0;
4183
4184 if (!in->flushing_cap_item.is_on_list())
4185 session->flushing_caps.push_back(&in->flushing_cap_item);
4186 session->flushing_caps_tids.insert(flush_tid);
4187
4188 *ptid = flush_tid;
4189 return flushing;
4190}
4191
4192void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4193{
4194 for (auto &p : in->cap_snaps) {
4195 CapSnap &capsnap = p.second;
4196 if (capsnap.flush_tid > 0) {
4197 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4198 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4199 }
4200 }
4201 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4202 it != in->flushing_cap_tids.end();
4203 ++it) {
4204 old_s->flushing_caps_tids.erase(it->first);
4205 new_s->flushing_caps_tids.insert(it->first);
4206 }
4207 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4208}
4209
4210/*
4211 * Flush all caps back to the MDS. Because the callers generally wait on the
4212 * result of this function (syncfs and umount cases), we set
4213 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4214 */
4215void Client::flush_caps_sync()
4216{
4217 ldout(cct, 10) << __func__ << dendl;
4218 xlist<Inode*>::iterator p = delayed_caps.begin();
4219 while (!p.end()) {
4220 unsigned flags = CHECK_CAPS_NODELAY;
4221 Inode *in = *p;
4222
4223 ++p;
4224 delayed_caps.pop_front();
4225 if (p.end() && cap_list.empty())
4226 flags |= CHECK_CAPS_SYNCHRONOUS;
4227 check_caps(in, flags);
4228 }
4229
4230 // other caps, too
4231 p = cap_list.begin();
4232 while (!p.end()) {
4233 unsigned flags = CHECK_CAPS_NODELAY;
4234 Inode *in = *p;
4235
4236 ++p;
4237 if (p.end())
4238 flags |= CHECK_CAPS_SYNCHRONOUS;
4239 check_caps(in, flags);
4240 }
4241}
4242
4243void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
4244{
4245 ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
4246 Cap *cap = in->auth_cap;
4247 assert(cap->session == session);
4248
4249 for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
4250 p != in->flushing_cap_tids.end();
4251 ++p) {
4252 bool req_sync = false;
4253
4254 /* If this is a synchronous request, then flush the journal on last one */
4255 if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
4256 req_sync = true;
4257
4258 send_cap(in, session, cap, req_sync,
4259 (get_caps_used(in) | in->caps_dirty()),
4260 in->caps_wanted(), (cap->issued | cap->implemented),
4261 p->second, p->first);
4262 }
4263}
4264
4265void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4266{
4267 while (in->flushing_caps) {
4268 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4269 assert(it != in->flushing_cap_tids.end());
4270 if (it->first > want)
4271 break;
4272 ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
4273 << ccap_string(it->second) << " want " << want
4274 << " last " << it->first << dendl;
4275 wait_on_list(in->waitfor_caps);
4276 }
4277}
4278
4279void Client::wait_sync_caps(ceph_tid_t want)
4280{
4281 retry:
4282 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4283 << num_flushing_caps << " total flushing)" << dendl;
4284 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4285 p != mds_sessions.end();
4286 ++p) {
4287 MetaSession *s = p->second;
4288 if (s->flushing_caps_tids.empty())
4289 continue;
4290 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4291 if (oldest_tid <= want) {
4292 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4293 << " (want " << want << ")" << dendl;
4294 sync_cond.Wait(client_lock);
4295 goto retry;
4296 }
4297 }
4298}
4299
4300void Client::kick_flushing_caps(MetaSession *session)
4301{
4302 mds_rank_t mds = session->mds_num;
4303 ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;
4304
4305 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4306 Inode *in = *p;
4307 if (session->early_flushing_caps.count(in))
4308 continue;
4309 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4310 if (in->cap_snaps.size())
4311 flush_snaps(in, true);
4312 if (in->flushing_caps)
4313 flush_caps(in, session);
4314 }
4315
4316 session->early_flushing_caps.clear();
4317}
4318
4319void Client::early_kick_flushing_caps(MetaSession *session)
4320{
4321 session->early_flushing_caps.clear();
4322
4323 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4324 Inode *in = *p;
4325 assert(in->auth_cap);
4326
4327 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4328 // stage. This guarantees that MDS processes the cap flush message before issuing
4329 // the flushing caps to other client.
4330 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
4331 continue;
4332
4333 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4334 << " to mds." << session->mds_num << dendl;
4335
4336 session->early_flushing_caps.insert(in);
4337
4338 if (in->cap_snaps.size())
4339 flush_snaps(in, true);
4340 if (in->flushing_caps)
4341 flush_caps(in, session);
4342
4343 }
4344}
4345
4346void Client::kick_maxsize_requests(MetaSession *session)
4347{
4348 xlist<Cap*>::iterator iter = session->caps.begin();
4349 while (!iter.end()){
4350 (*iter)->inode->requested_max_size = 0;
4351 (*iter)->inode->wanted_max_size = 0;
4352 signal_cond_list((*iter)->inode->waitfor_caps);
4353 ++iter;
4354 }
4355}
4356
4357void SnapRealm::build_snap_context()
4358{
4359 set<snapid_t> snaps;
4360 snapid_t max_seq = seq;
4361
4362 // start with prior_parents?
4363 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4364 snaps.insert(prior_parent_snaps[i]);
4365
4366 // current parent's snaps
4367 if (pparent) {
4368 const SnapContext& psnapc = pparent->get_snap_context();
4369 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4370 if (psnapc.snaps[i] >= parent_since)
4371 snaps.insert(psnapc.snaps[i]);
4372 if (psnapc.seq > max_seq)
4373 max_seq = psnapc.seq;
4374 }
4375
4376 // my snaps
4377 for (unsigned i=0; i<my_snaps.size(); i++)
4378 snaps.insert(my_snaps[i]);
4379
4380 // ok!
4381 cached_snap_context.seq = max_seq;
4382 cached_snap_context.snaps.resize(0);
4383 cached_snap_context.snaps.reserve(snaps.size());
4384 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4385 cached_snap_context.snaps.push_back(*p);
4386}
4387
4388void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4389{
4390 list<SnapRealm*> q;
4391 q.push_back(realm);
4392
4393 while (!q.empty()) {
4394 realm = q.front();
4395 q.pop_front();
4396
4397 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4398 realm->invalidate_cache();
4399
4400 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4401 p != realm->pchildren.end();
4402 ++p)
4403 q.push_back(*p);
4404 }
4405}
4406
4407SnapRealm *Client::get_snap_realm(inodeno_t r)
4408{
4409 SnapRealm *realm = snap_realms[r];
4410 if (!realm)
4411 snap_realms[r] = realm = new SnapRealm(r);
4412 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4413 realm->nref++;
4414 return realm;
4415}
4416
4417SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4418{
4419 if (snap_realms.count(r) == 0) {
4420 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4421 return NULL;
4422 }
4423 SnapRealm *realm = snap_realms[r];
4424 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4425 realm->nref++;
4426 return realm;
4427}
4428
4429void Client::put_snap_realm(SnapRealm *realm)
4430{
4431 ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
4432 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4433 if (--realm->nref == 0) {
4434 snap_realms.erase(realm->ino);
4435 if (realm->pparent) {
4436 realm->pparent->pchildren.erase(realm);
4437 put_snap_realm(realm->pparent);
4438 }
4439 delete realm;
4440 }
4441}
4442
4443bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4444{
4445 if (realm->parent != parent) {
4446 ldout(cct, 10) << "adjust_realm_parent " << *realm
4447 << " " << realm->parent << " -> " << parent << dendl;
4448 realm->parent = parent;
4449 if (realm->pparent) {
4450 realm->pparent->pchildren.erase(realm);
4451 put_snap_realm(realm->pparent);
4452 }
4453 realm->pparent = get_snap_realm(parent);
4454 realm->pparent->pchildren.insert(realm);
4455 return true;
4456 }
4457 return false;
4458}
4459
4460static bool has_new_snaps(const SnapContext& old_snapc,
4461 const SnapContext& new_snapc)
4462{
4463 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4464}
4465
4466
4467void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
4468{
4469 SnapRealm *first_realm = NULL;
4470 ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
4471
4472 map<SnapRealm*, SnapContext> dirty_realms;
4473
4474 bufferlist::iterator p = bl.begin();
4475 while (!p.end()) {
4476 SnapRealmInfo info;
4477 ::decode(info, p);
4478 SnapRealm *realm = get_snap_realm(info.ino());
4479
4480 bool invalidate = false;
4481
4482 if (info.seq() > realm->seq) {
4483 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
4484 << dendl;
4485
4486 if (flush) {
4487 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4488 // flush me + children
4489 list<SnapRealm*> q;
4490 q.push_back(realm);
4491 while (!q.empty()) {
4492 SnapRealm *realm = q.front();
4493 q.pop_front();
4494
4495 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4496 p != realm->pchildren.end();
4497 ++p)
4498 q.push_back(*p);
4499
4500 if (dirty_realms.count(realm) == 0) {
4501 realm->nref++;
4502 dirty_realms[realm] = realm->get_snap_context();
4503 }
4504 }
4505 }
4506
4507 // update
4508 realm->seq = info.seq();
4509 realm->created = info.created();
4510 realm->parent_since = info.parent_since();
4511 realm->prior_parent_snaps = info.prior_parent_snaps;
4512 realm->my_snaps = info.my_snaps;
4513 invalidate = true;
4514 }
4515
4516 // _always_ verify parent
4517 if (adjust_realm_parent(realm, info.parent()))
4518 invalidate = true;
4519
4520 if (invalidate) {
4521 invalidate_snaprealm_and_children(realm);
4522 ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
4523 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4524 } else {
4525 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
4526 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4527 }
4528
4529 if (!first_realm)
4530 first_realm = realm;
4531 else
4532 put_snap_realm(realm);
4533 }
4534
4535 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4536 q != dirty_realms.end();
4537 ++q) {
4538 SnapRealm *realm = q->first;
4539 // if there are new snaps ?
4540 if (has_new_snaps(q->second, realm->get_snap_context())) {
4541 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4542 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4543 while (!r.end()) {
4544 Inode *in = *r;
4545 ++r;
4546 queue_cap_snap(in, q->second);
4547 }
4548 } else {
4549 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4550 }
4551 put_snap_realm(realm);
4552 }
4553
4554 if (realm_ret)
4555 *realm_ret = first_realm;
4556 else
4557 put_snap_realm(first_realm);
4558}
4559
4560void Client::handle_snap(MClientSnap *m)
4561{
4562 ldout(cct, 10) << "handle_snap " << *m << dendl;
4563 mds_rank_t mds = mds_rank_t(m->get_source().num());
4564 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4565 if (!session) {
4566 m->put();
4567 return;
4568 }
4569
4570 got_mds_push(session);
4571
4572 map<Inode*, SnapContext> to_move;
4573 SnapRealm *realm = 0;
4574
4575 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4576 assert(m->head.split);
4577 SnapRealmInfo info;
4578 bufferlist::iterator p = m->bl.begin();
4579 ::decode(info, p);
4580 assert(info.ino() == m->head.split);
4581
4582 // flush, then move, ino's.
4583 realm = get_snap_realm(info.ino());
4584 ldout(cct, 10) << " splitting off " << *realm << dendl;
4585 for (vector<inodeno_t>::iterator p = m->split_inos.begin();
4586 p != m->split_inos.end();
4587 ++p) {
4588 vinodeno_t vino(*p, CEPH_NOSNAP);
4589 if (inode_map.count(vino)) {
4590 Inode *in = inode_map[vino];
4591 if (!in->snaprealm || in->snaprealm == realm)
4592 continue;
4593 if (in->snaprealm->created > info.created()) {
4594 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4595 << *in->snaprealm << dendl;
4596 continue;
4597 }
4598 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4599
4600
4601 in->snaprealm_item.remove_myself();
4602 to_move[in] = in->snaprealm->get_snap_context();
4603 put_snap_realm(in->snaprealm);
4604 }
4605 }
4606
4607 // move child snaprealms, too
4608 for (vector<inodeno_t>::iterator p = m->split_realms.begin();
4609 p != m->split_realms.end();
4610 ++p) {
4611 ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
4612 SnapRealm *child = get_snap_realm_maybe(*p);
4613 if (!child)
4614 continue;
4615 adjust_realm_parent(child, realm->ino);
4616 put_snap_realm(child);
4617 }
4618 }
4619
4620 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
4621
4622 if (realm) {
4623 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
4624 Inode *in = p->first;
4625 in->snaprealm = realm;
4626 realm->inodes_with_caps.push_back(&in->snaprealm_item);
4627 realm->nref++;
4628 // queue for snap writeback
4629 if (has_new_snaps(p->second, realm->get_snap_context()))
4630 queue_cap_snap(in, p->second);
4631 }
4632 put_snap_realm(realm);
4633 }
4634
4635 m->put();
4636}
4637
4638void Client::handle_quota(MClientQuota *m)
4639{
4640 mds_rank_t mds = mds_rank_t(m->get_source().num());
4641 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4642 if (!session) {
4643 m->put();
4644 return;
4645 }
4646
4647 got_mds_push(session);
4648
4649 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4650
4651 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4652 if (inode_map.count(vino)) {
4653 Inode *in = NULL;
4654 in = inode_map[vino];
4655
4656 if (in) {
4657 in->quota = m->quota;
4658 in->rstat = m->rstat;
4659 }
4660 }
4661
4662 m->put();
4663}
4664
4665void Client::handle_caps(MClientCaps *m)
4666{
4667 mds_rank_t mds = mds_rank_t(m->get_source().num());
4668 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4669 if (!session) {
4670 m->put();
4671 return;
4672 }
4673
4674 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4675 // Pause RADOS operations until we see the required epoch
4676 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4677 }
4678
4679 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4680 // Record the barrier so that we will transmit it to MDS when releasing
4681 set_cap_epoch_barrier(m->osd_epoch_barrier);
4682 }
4683
4684 got_mds_push(session);
4685
4686 m->clear_payload(); // for if/when we send back to MDS
4687
4688 Inode *in = 0;
4689 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4690 if (inode_map.count(vino))
4691 in = inode_map[vino];
4692 if (!in) {
4693 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4694 ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4695 session->enqueue_cap_release(
4696 m->get_ino(),
4697 m->get_cap_id(),
4698 m->get_seq(),
4699 m->get_mseq(),
4700 cap_epoch_barrier);
4701 } else {
4702 ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
4703 }
4704 m->put();
4705
4706 // in case the mds is waiting on e.g. a revocation
4707 flush_cap_releases();
4708 return;
4709 }
4710
4711 switch (m->get_op()) {
4712 case CEPH_CAP_OP_EXPORT:
4713 return handle_cap_export(session, in, m);
4714 case CEPH_CAP_OP_FLUSHSNAP_ACK:
4715 return handle_cap_flushsnap_ack(session, in, m);
4716 case CEPH_CAP_OP_IMPORT:
4717 handle_cap_import(session, in, m);
4718 }
4719
4720 if (in->caps.count(mds) == 0) {
4721 ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
4722 m->put();
4723 return;
4724 }
4725
4726 Cap *cap = in->caps[mds];
4727
4728 switch (m->get_op()) {
4729 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4730 case CEPH_CAP_OP_IMPORT:
4731 case CEPH_CAP_OP_REVOKE:
4732 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
4733 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
4734 default:
4735 m->put();
4736 }
4737}
4738
4739void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
4740{
4741 mds_rank_t mds = session->mds_num;
4742
4743 ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
4744 << " IMPORT from mds." << mds << dendl;
4745
4746 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4747 Cap *cap = NULL;
4748 UserPerm cap_perms;
4749 if (m->peer.cap_id && in->caps.count(peer_mds)) {
4750 cap = in->caps[peer_mds];
4751 if (cap) {
4752 cap_perms = cap->latest_perms;
4753 }
4754 }
4755
4756 // add/update it
4757 SnapRealm *realm = NULL;
4758 update_snap_trace(m->snapbl, &realm);
4759
4760 add_update_cap(in, session, m->get_cap_id(),
4761 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
4762 CEPH_CAP_FLAG_AUTH, cap_perms);
4763
4764 if (cap && cap->cap_id == m->peer.cap_id) {
4765 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
4766 }
4767
4768 if (realm)
4769 put_snap_realm(realm);
4770
4771 if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
4772 // reflush any/all caps (if we are now the auth_cap)
4773 if (in->cap_snaps.size())
4774 flush_snaps(in, true);
4775 if (in->flushing_caps)
4776 flush_caps(in, session);
4777 }
4778}
4779
4780void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
4781{
4782 mds_rank_t mds = session->mds_num;
4783
4784 ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
4785 << " EXPORT from mds." << mds << dendl;
4786
4787 Cap *cap = NULL;
4788 if (in->caps.count(mds))
4789 cap = in->caps[mds];
4790
4791 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4792
4793 if (cap && cap->cap_id == m->get_cap_id()) {
4794 if (m->peer.cap_id) {
4795 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
4796 if (in->caps.count(peer_mds)) {
4797 Cap *tcap = in->caps[peer_mds];
181888fb 4798 if (tcap->cap_id == m->peer.cap_id &&
7c673cae
FG
4799 ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
4800 tcap->cap_id = m->peer.cap_id;
4801 tcap->seq = m->peer.seq - 1;
4802 tcap->issue_seq = tcap->seq;
4803 tcap->mseq = m->peer.mseq;
4804 tcap->issued |= cap->issued;
4805 tcap->implemented |= cap->issued;
4806 if (cap == in->auth_cap)
4807 in->auth_cap = tcap;
4808 if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
4809 adjust_session_flushing_caps(in, session, tsession);
4810 }
4811 } else {
4812 add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
4813 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
4814 cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
4815 cap->latest_perms);
4816 }
4817 } else {
4818 if (cap == in->auth_cap)
4819 in->flags |= I_CAP_DROPPED;
4820 }
4821
4822 remove_cap(cap, false);
4823 }
4824
4825 m->put();
4826}
4827
4828void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
4829{
4830 mds_rank_t mds = session->mds_num;
4831 assert(in->caps[mds]);
4832
4833 ldout(cct, 10) << "handle_cap_trunc on ino " << *in
4834 << " size " << in->size << " -> " << m->get_size()
4835 << dendl;
4836
4837 int implemented = 0;
4838 int issued = in->caps_issued(&implemented) | in->caps_dirty();
4839 issued |= implemented;
4840 update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(),
4841 m->get_size(), m->get_change_attr(), m->get_time_warp_seq(),
4842 m->get_ctime(), m->get_mtime(), m->get_atime(),
4843 m->inline_version, m->inline_data, issued);
4844 m->put();
4845}
4846
4847void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
4848{
4849 ceph_tid_t flush_ack_tid = m->get_client_tid();
4850 int dirty = m->get_dirty();
4851 int cleaned = 0;
4852 int flushed = 0;
4853
4854 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4855 it != in->flushing_cap_tids.end(); ) {
4856 if (it->first == flush_ack_tid)
4857 cleaned = it->second;
4858 if (it->first <= flush_ack_tid) {
4859 session->flushing_caps_tids.erase(it->first);
4860 in->flushing_cap_tids.erase(it++);
4861 ++flushed;
4862 continue;
4863 }
4864 cleaned &= ~it->second;
4865 if (!cleaned)
4866 break;
4867 ++it;
4868 }
4869
4870 ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
4871 << " cleaned " << ccap_string(cleaned) << " on " << *in
4872 << " with " << ccap_string(dirty) << dendl;
4873
4874 if (flushed) {
4875 signal_cond_list(in->waitfor_caps);
4876 if (session->flushing_caps_tids.empty() ||
4877 *session->flushing_caps_tids.begin() > flush_ack_tid)
4878 sync_cond.Signal();
4879 }
4880
4881 if (!dirty) {
4882 in->cap_dirtier_uid = -1;
4883 in->cap_dirtier_gid = -1;
4884 }
4885
4886 if (!cleaned) {
4887 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
4888 } else {
4889 if (in->flushing_caps) {
4890 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
4891 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
4892 in->flushing_caps &= ~cleaned;
4893 if (in->flushing_caps == 0) {
4894 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
4895 num_flushing_caps--;
4896 if (in->cap_snaps.empty())
4897 in->flushing_cap_item.remove_myself();
4898 }
4899 if (!in->caps_dirty())
4900 put_inode(in);
4901 }
4902 }
4903
4904 m->put();
4905}
4906
4907
4908void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
4909{
4910 mds_rank_t mds = session->mds_num;
4911 assert(in->caps[mds]);
4912 snapid_t follows = m->get_snap_follows();
4913
4914 if (in->cap_snaps.count(follows)) {
4915 CapSnap &capsnap = in->cap_snaps.at(follows);
4916 if (m->get_client_tid() != capsnap.flush_tid) {
4917 ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
4918 } else {
4919 ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
4920 << " on " << *in << dendl;
4921 InodeRef tmp_ref;
4922 if (in->get_num_ref() == 1)
4923 tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
4924 if (in->flushing_caps == 0 && in->cap_snaps.empty())
4925 in->flushing_cap_item.remove_myself();
4926 session->flushing_caps_tids.erase(capsnap.flush_tid);
4927 in->cap_snaps.erase(follows);
4928 }
4929 } else {
4930 ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
4931 << " on " << *in << dendl;
4932 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
4933 }
4934
4935 m->put();
4936}
4937
4938class C_Client_DentryInvalidate : public Context {
4939private:
4940 Client *client;
4941 vinodeno_t dirino;
4942 vinodeno_t ino;
4943 string name;
4944public:
4945 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
4946 client(c), name(dn->name) {
4947 if (client->use_faked_inos()) {
4948 dirino.ino = dn->dir->parent_inode->faked_ino;
4949 if (del)
4950 ino.ino = dn->inode->faked_ino;
4951 } else {
4952 dirino = dn->dir->parent_inode->vino();
4953 if (del)
4954 ino = dn->inode->vino();
4955 }
4956 if (!del)
4957 ino.ino = inodeno_t();
4958 }
4959 void finish(int r) override {
4960 // _async_dentry_invalidate is responsible for its own locking
4961 assert(!client->client_lock.is_locked_by_me());
4962 client->_async_dentry_invalidate(dirino, ino, name);
4963 }
4964};
4965
4966void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
4967{
4968 if (unmounting)
4969 return;
4970 ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
4971 << " in dir " << dirino << dendl;
4972 dentry_invalidate_cb(callback_handle, dirino, ino, name);
4973}
4974
4975void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
4976{
4977 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
4978 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
4979}
4980
4981void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
4982{
4983 int ref = in->get_num_ref();
4984
4985 if (in->dir && !in->dir->dentries.empty()) {
4986 for (auto p = in->dir->dentries.begin();
4987 p != in->dir->dentries.end(); ) {
4988 Dentry *dn = p->second;
4989 ++p;
4990 /* rmsnap removes whole subtree, need trim inodes recursively.
4991 * we don't need to invalidate dentries recursively. because
4992 * invalidating a directory dentry effectively invalidate
4993 * whole subtree */
4994 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
4995 _try_to_trim_inode(dn->inode.get(), false);
4996
4997 if (dn->lru_is_expireable())
4998 unlink(dn, true, false); // keep dir, drop dentry
4999 }
5000 if (in->dir->dentries.empty()) {
5001 close_dir(in->dir);
5002 --ref;
5003 }
5004 }
5005
5006 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
5007 InodeRef snapdir = open_snapdir(in);
5008 _try_to_trim_inode(snapdir.get(), false);
5009 --ref;
5010 }
5011
5012 if (ref > 0 && in->ll_ref > 0 && sched_inval) {
5013 set<Dentry*>::iterator q = in->dn_set.begin();
5014 while (q != in->dn_set.end()) {
5015 Dentry *dn = *q++;
5016 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5017 // so in->dn_set doesn't always reflect the state of kernel's dcache.
5018 _schedule_invalidate_dentry_callback(dn, true);
5019 unlink(dn, true, true);
5020 }
5021 }
5022}
5023
5024void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
5025{
5026 mds_rank_t mds = session->mds_num;
5027 int used = get_caps_used(in);
5028 int wanted = in->caps_wanted();
5029
5030 const int old_caps = cap->issued;
5031 const int new_caps = m->get_caps();
5032 ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
5033 << " mds." << mds << " seq " << m->get_seq()
5034 << " caps now " << ccap_string(new_caps)
5035 << " was " << ccap_string(old_caps) << dendl;
5036 cap->seq = m->get_seq();
5037
5038 in->layout = m->get_layout();
5039
5040 // update inode
5041 int implemented = 0;
5042 int issued = in->caps_issued(&implemented) | in->caps_dirty();
5043 issued |= implemented;
5044
5045 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
5046 in->mode = m->head.mode;
5047 in->uid = m->head.uid;
5048 in->gid = m->head.gid;
5049 in->btime = m->btime;
5050 }
5051 bool deleted_inode = false;
5052 if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
5053 in->nlink = m->head.nlink;
5054 if (in->nlink == 0 &&
5055 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5056 deleted_inode = true;
5057 }
5058 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
5059 m->xattrbl.length() &&
5060 m->head.xattr_version > in->xattr_version) {
5061 bufferlist::iterator p = m->xattrbl.begin();
5062 ::decode(in->xattrs, p);
5063 in->xattr_version = m->head.xattr_version;
5064 }
5065 update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
5066 m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
5067 m->get_mtime(), m->get_atime(),
5068 m->inline_version, m->inline_data, issued);
5069
5070 // max_size
5071 if (cap == in->auth_cap &&
5072 m->get_max_size() != in->max_size) {
5073 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5074 in->max_size = m->get_max_size();
5075 if (in->max_size > in->wanted_max_size) {
5076 in->wanted_max_size = 0;
5077 in->requested_max_size = 0;
5078 }
5079 }
5080
5081 bool check = false;
5082 if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
5083 check = true;
5084
5085 check_cap_issue(in, cap, new_caps);
5086
5087 // update caps
b32b8144
FG
5088 int revoked = old_caps & ~new_caps;
5089 if (revoked) {
5090 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5091 cap->issued = new_caps;
5092 cap->implemented |= new_caps;
5093
b32b8144
FG
5094 // recall delegations if we're losing caps necessary for them
5095 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5096 in->recall_deleg(false);
5097 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5098 in->recall_deleg(true);
5099
7c673cae
FG
5100 if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
5101 && !_flush(in, new C_Client_FlushComplete(this, in))) {
5102 // waitin' for flush
5103 } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
5104 if (_release(in))
5105 check = true;
5106 } else {
5107 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5108 check = true;
5109 }
5110
5111 } else if (old_caps == new_caps) {
5112 ldout(cct, 10) << " caps unchanged at " << ccap_string(old_caps) << dendl;
5113 } else {
5114 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
5115 cap->issued = new_caps;
5116 cap->implemented |= new_caps;
5117
5118 if (cap == in->auth_cap) {
5119 // non-auth MDS is revoking the newly grant caps ?
5120 for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
5121 if (it->second == cap)
5122 continue;
5123 if (it->second->implemented & ~it->second->issued & new_caps) {
5124 check = true;
5125 break;
5126 }
5127 }
5128 }
5129 }
5130
5131 if (check)
5132 check_caps(in, 0);
5133
5134 // wake up waiters
5135 if (new_caps)
5136 signal_cond_list(in->waitfor_caps);
5137
5138 // may drop inode's last ref
5139 if (deleted_inode)
5140 _try_to_trim_inode(in, true);
5141
5142 m->put();
5143}
5144
// Fetch the supplementary group list for (uid, gid).
// Tries the registered getgroups callback first; otherwise falls back to
// getpwuid()+getgrouplist(), growing the buffer until it fits.
// On success, *sgids points to a malloc'd array the CALLER must free, and
// the group count is returned; on failure a negative errno is returned.
// NOTE(review): if getpwuid() finds no entry without setting errno, this
// returns -errno == 0, which callers would see as "zero groups" — confirm.
int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid)
{
  // cppcheck-suppress variableScope
  int sgid_count;
  gid_t *sgid_buf;

  if (getgroups_cb) {
    sgid_count = getgroups_cb(callback_handle, &sgid_buf);
    if (sgid_count > 0) {
      // callback supplied the list; it also owns the allocation policy
      *sgids = sgid_buf;
      return sgid_count;
    }
  }

#if HAVE_GETGROUPLIST
  struct passwd *pw;
  pw = getpwuid(uid);
  if (pw == NULL) {
    ldout(cct, 3) << "getting user entry failed" << dendl;
    return -errno;
  }
  //use PAM to get the group list
  // initial number of group entries, defaults to posix standard of 16
  // PAM implementations may provide more than 16 groups....
  sgid_count = 16;
  sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t));
  if (sgid_buf == NULL) {
    ldout(cct, 3) << "allocating group memory failed" << dendl;
    return -ENOMEM;
  }

  while (1) {
#if defined(__APPLE__)
    if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) {
#else
    if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) {
#endif
      // we need to resize the group list and try again
      // (getgrouplist wrote the required size back into sgid_count)
      void *_realloc = NULL;
      if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) {
	ldout(cct, 3) << "allocating group memory failed" << dendl;
	free(sgid_buf);
	return -ENOMEM;
      }
      sgid_buf = (gid_t*)_realloc;
      continue;
    }
    // list was successfully retrieved
    break;
  }
  *sgids = sgid_buf;
  return sgid_count;
#else
  // no getgrouplist() on this platform: report no supplementary groups
  return 0;
#endif
}
5201
5202int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5203{
5204 if (perms.uid() == 0)
5205 return 0;
5206
5207 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5208 int ret = _posix_acl_permission(in, perms, want);
5209 if (ret != -EAGAIN)
5210 return ret;
5211 }
5212
5213 // check permissions before doing anything else
5214 if (!in->check_mode(perms, want))
5215 return -EACCES;
5216 return 0;
5217}
5218
5219int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5220 const UserPerm& perms)
5221{
5222 int r = _getattr_for_perm(in, perms);
5223 if (r < 0)
5224 goto out;
5225
5226 r = 0;
5227 if (strncmp(name, "system.", 7) == 0) {
5228 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5229 r = -EPERM;
5230 } else {
5231 r = inode_permission(in, perms, want);
5232 }
5233out:
5234 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5235 return r;
5236}
5237
5238ostream& operator<<(ostream &out, const UserPerm& perm) {
5239 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5240 return out;
5241}
5242
// Check whether `perms` is allowed to apply the attribute changes described
// by stx/mask to inode `in`, following POSIX chown/chmod/utimensat rules.
// May clear S_ISGID in stx->stx_mode as a side effect (see chmod case).
// Returns 0 if permitted, negative errno otherwise.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;

  // refresh mode/ownership (and xattrs if ACLs are enabled) before judging
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // truncating requires write permission on the file itself
  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  // chown: only root may actually change the owner; a non-root owner may
  // only "set" it to the value it already has
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  // chgrp: the owner may change the group to one they belong to, or keep it
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  // chmod: owner or root only; non-members of the resulting group have
  // the setgid bit stripped from the requested mode
  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // the group that will be in effect after this setattr
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  // timestamps: non-owners may only set atime/mtime to "now" (and then
  // still need write permission); explicit times, ctime or btime require
  // ownership
  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5299
// Check whether `perms` may open inode `in` with the given open(2) flags.
// Translates the access mode into MAY_* bits, rejects opens that can never
// succeed (symlinks, write-opens of directories), then defers to
// inode_permission().  Returns 0 or a negative errno.
int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;

  unsigned want = 0;

  // map O_RDONLY/O_WRONLY/O_RDWR onto MAY_READ/MAY_WRITE
  if ((flags & O_ACCMODE) == O_WRONLY)
    want = MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDWR)
    want = MAY_READ | MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDONLY)
    want = MAY_READ;
  if (flags & O_TRUNC)
    want |= MAY_WRITE;  // truncation writes even with O_RDONLY

  int r = 0;
  switch (in->mode & S_IFMT) {
  case S_IFLNK:
    // a symlink itself can never be opened
    r = -ELOOP;
    goto out;
  case S_IFDIR:
    if (want & MAY_WRITE) {
      // directories cannot be opened for writing
      r = -EISDIR;
      goto out;
    }
    break;
  }

  // refresh mode/ownership (and xattrs if ACLs enabled) before checking
  r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = inode_permission(in, perms, want);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5336
5337int Client::may_lookup(Inode *dir, const UserPerm& perms)
5338{
181888fb 5339 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5340 int r = _getattr_for_perm(dir, perms);
5341 if (r < 0)
5342 goto out;
5343
5344 r = inode_permission(dir, perms, MAY_EXEC);
5345out:
5346 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5347 return r;
5348}
5349
5350int Client::may_create(Inode *dir, const UserPerm& perms)
5351{
181888fb 5352 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5353 int r = _getattr_for_perm(dir, perms);
5354 if (r < 0)
5355 goto out;
5356
5357 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5358out:
5359 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5360 return r;
5361}
5362
5363int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5364{
181888fb 5365 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5366 int r = _getattr_for_perm(dir, perms);
5367 if (r < 0)
5368 goto out;
5369
5370 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5371 if (r < 0)
5372 goto out;
5373
5374 /* 'name == NULL' means rmsnap */
5375 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5376 InodeRef otherin;
5377 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5378 if (r < 0)
5379 goto out;
5380 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5381 r = -EPERM;
5382 }
5383out:
5384 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5385 return r;
5386}
5387
// Check whether `perms` may create a hard link to `in`.  Root and the
// owner may always link; other users may only link regular files that are
// neither setuid nor group-executable-setgid, and must have read+write
// permission on them.  Returns 0 or a negative errno.
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;

  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // root and the file's owner are always allowed
  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  r = -EPERM;
  // non-owners may only link regular files
  if (!S_ISREG(in->mode))
    goto out;

  // never setuid files
  if (in->mode & S_ISUID)
    goto out;

  // never setgid files that are group-executable
  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  // otherwise the caller needs read+write access to the target
  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5415
5416int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5417{
5418 int mask = CEPH_STAT_CAP_MODE;
5419 bool force = false;
5420 if (acl_type != NO_ACL) {
5421 mask |= CEPH_STAT_CAP_XATTR;
5422 force = in->xattr_version == 0;
5423 }
5424 return _getattr(in, mask, perms, force);
5425}
5426
5427vinodeno_t Client::_get_vino(Inode *in)
5428{
5429 /* The caller must hold the client lock */
5430 return vinodeno_t(in->ino, in->snapid);
5431}
5432
5433inodeno_t Client::_get_inodeno(Inode *in)
5434{
5435 /* The caller must hold the client lock */
5436 return in->ino;
5437}
5438
5439
5440/**
5441 * Resolve an MDS spec to a list of MDS daemon GIDs.
5442 *
5443 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5444 * It may be '*' in which case it matches all GIDs.
5445 *
5446 * If no error is returned, the `targets` vector will be populated with at least
5447 * one MDS.
5448 */
5449int Client::resolve_mds(
5450 const std::string &mds_spec,
5451 std::vector<mds_gid_t> *targets)
5452{
5453 assert(fsmap);
5454 assert(targets != nullptr);
5455
5456 mds_role_t role;
5457 std::stringstream ss;
5458 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5459 if (role_r == 0) {
5460 // We got a role, resolve it to a GID
5461 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5462 << role << "'" << dendl;
5463 targets->push_back(
5464 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5465 return 0;
5466 }
5467
5468 std::string strtol_err;
5469 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5470 if (strtol_err.empty()) {
5471 // It is a possible GID
5472 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5473 if (fsmap->gid_exists(mds_gid)) {
5474 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5475 targets->push_back(mds_gid);
5476 } else {
5477 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5478 << dendl;
5479 return -ENOENT;
5480 }
5481 } else if (mds_spec == "*") {
5482 // It is a wildcard: use all MDSs
5483 const auto mds_info = fsmap->get_mds_info();
5484
5485 if (mds_info.empty()) {
5486 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5487 return -ENOENT;
5488 }
5489
5490 for (const auto i : mds_info) {
5491 targets->push_back(i.first);
5492 }
5493 } else {
5494 // It did not parse as an integer, it is not a wildcard, it must be a name
5495 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5496 if (mds_gid == 0) {
5497 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5498
5499 lderr(cct) << "FSMap: " << *fsmap << dendl;
5500
5501 return -ENOENT;
5502 } else {
5503 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5504 << "' to GID " << mds_gid << dendl;
5505 targets->push_back(mds_gid);
5506 }
5507 }
5508
5509 return 0;
5510}
5511
5512
5513/**
5514 * Authenticate with mon and establish global ID
5515 */
int Client::authenticate()
{
  assert(client_lock.is_locked_by_me());

  // already authenticated (global id established)? nothing to do
  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks on monitor traffic, so release the
  // client lock around the call and re-take it afterwards
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  // adopt the monitor-assigned global id as our messenger entity name
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5536
// Fetch the latest FSMap (user == false) or FSMapUser (user == true) from
// the monitors, blocking until our cached copy is at least as new as the
// cluster's latest version.  Returns 0 or a negative errno.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop the client lock while blocking on the monitor round-trip
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);   // retry on transient failure

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) and wait until handle_fsmap_user catches us up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full (non-user) FSMap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5580
5581/**
5582 *
5583 * @mds_spec one of ID, rank, GID, "*"
5584 *
5585 */
// Send an administrative command to the MDS daemon(s) named by mds_spec
// (an ID, rank, GID or "*").  Replies are routed back through
// handle_command_reply(); `onfinish` fires once all targeted daemons have
// replied.  Returns 0 once the command(s) are sent, or a negative errno.
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  Mutex::Locker lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need the full FSMap to resolve specs and find daemon addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    entity_inst_t inst = info.get_inst();
    ConnectionRef conn = messenger->get_connection(inst);

    // Generate MDSCommandOp state; the table entry is matched up with the
    // reply by tid in handle_command_reply()
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
        << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    MCommand *m = op.get_message(monclient->get_fsid());
    conn->send_message(m);
  }
  gather.activate();

  return 0;
}
5668
5669void Client::handle_command_reply(MCommandReply *m)
5670{
5671 ceph_tid_t const tid = m->get_tid();
5672
5673 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5674
5675 if (!command_table.exists(tid)) {
5676 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5677 m->put();
5678 return;
5679 }
5680
5681 auto &op = command_table.get_command(tid);
5682 if (op.outbl) {
5683 op.outbl->claim(m->get_data());
5684 }
5685 if (op.outs) {
5686 *op.outs = m->rs;
5687 }
5688
5689 if (op.on_finish) {
5690 op.on_finish->complete(m->r);
5691 }
5692
5693 command_table.erase(tid);
5694
5695 m->put();
5696}
5697
5698// -------------------
5699// MOUNT
5700
// Mount the filesystem at mount_root (default "/"): authenticate, subscribe
// to the MDS map (optionally waiting for an available MDS cluster when
// require_mds is set), then walk up from the mount point issuing GETATTRs
// so that root and its ancestors are pinned in cache.  Returns 0 on
// success or a negative errno / CEPH_FUSE_NO_MDS_UP.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  // When client_mds_namespace is configured, subscribe to that specific
  // filesystem's mdsmap ("mdsmap.<cid>") instead of the default one.
  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    // block until the MDS cluster is usable (or clearly never will be)
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // GETATTR the mount point, then each ancestor up to the root, so quota
  // handling can see the whole chain; tolerate EACCES above the mount
  // point once root is instantiated.
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  _ll_get(root);   // pin the root inode for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5813
5814// UNMOUNT
5815
5816void Client::_close_sessions()
5817{
5818 while (!mds_sessions.empty()) {
5819 // send session closes!
5820 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5821 p != mds_sessions.end();
5822 ++p) {
5823 if (p->second->state != MetaSession::STATE_CLOSING) {
5824 _close_mds_session(p->second);
5825 }
5826 }
5827
5828 // wait for sessions to close
5829 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5830 mount_cond.Wait(client_lock);
5831 }
5832}
5833
31f18b77
FG
5834void Client::flush_mdlog_sync()
5835{
5836 if (mds_requests.empty())
5837 return;
5838 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5839 p != mds_sessions.end();
5840 ++p) {
5841 MetaSession *s = p->second;
5842 flush_mdlog(s);
5843 }
5844}
5845
5846void Client::flush_mdlog(MetaSession *session)
5847{
5848 // Only send this to Luminous or newer MDS daemons, older daemons
5849 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5850 const uint64_t features = session->con->get_features();
5851 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5852 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5853 session->con->send_message(m);
5854 }
5855}
5856
5857
b32b8144 5858void Client::_unmount()
7c673cae 5859{
181888fb
FG
5860 if (unmounting)
5861 return;
7c673cae
FG
5862
5863 ldout(cct, 2) << "unmounting" << dendl;
5864 unmounting = true;
5865
b32b8144
FG
5866 deleg_timeout = 0;
5867
31f18b77 5868 flush_mdlog_sync(); // flush the mdlog for pending requests, if any
7c673cae
FG
5869 while (!mds_requests.empty()) {
5870 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
5871 mount_cond.Wait(client_lock);
5872 }
5873
5874 if (tick_event)
5875 timer.cancel_event(tick_event);
5876 tick_event = 0;
5877
5878 cwd.reset();
5879
5880 // clean up any unclosed files
5881 while (!fd_map.empty()) {
5882 Fh *fh = fd_map.begin()->second;
5883 fd_map.erase(fd_map.begin());
5884 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
5885 _release_fh(fh);
5886 }
5887
5888 while (!ll_unclosed_fh_set.empty()) {
5889 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
5890 Fh *fh = *it;
5891 ll_unclosed_fh_set.erase(fh);
5892 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
5893 _release_fh(fh);
5894 }
5895
5896 while (!opened_dirs.empty()) {
5897 dir_result_t *dirp = *opened_dirs.begin();
5898 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
5899 _closedir(dirp);
5900 }
5901
5902 _ll_drop_pins();
5903
31f18b77
FG
5904 if (blacklisted) {
5905 ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;
5906
5907 if (cct->_conf->client_oc) {
5908 // Purge all cached data so that ObjectCacher doesn't get hung up
5909 // trying to flush it. ObjectCacher's behaviour on EBLACKLISTED
5910 // is to just leave things marked dirty
5911 // (http://tracker.ceph.com/issues/9105)
5912 for (const auto &i : inode_map) {
5913 objectcacher->purge_set(&(i.second->oset));
5914 }
5915 }
5916
5917 mounted = false;
5918 return;
5919 }
5920
7c673cae
FG
5921 while (unsafe_sync_write > 0) {
5922 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
5923 mount_cond.Wait(client_lock);
5924 }
5925
5926 if (cct->_conf->client_oc) {
5927 // flush/release all buffered data
5928 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
5929 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
5930 p != inode_map.end();
5931 p = next) {
5932 next = p;
5933 ++next;
5934 Inode *in = p->second;
5935 if (!in) {
5936 ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
5937 assert(in);
5938 }
5939 if (!in->caps.empty()) {
5940 InodeRef tmp_ref(in);
5941 _release(in);
5942 _flush(in, new C_Client_FlushComplete(this, in));
5943 }
5944 }
5945 }
5946
5947 flush_caps_sync();
5948 wait_sync_caps(last_flush_tid);
5949
5950 // empty lru cache
7c673cae
FG
5951 trim_cache();
5952
5953 while (lru.lru_get_size() > 0 ||
5954 !inode_map.empty()) {
5955 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
5956 << "+" << inode_map.size() << " items"
5957 << ", waiting (for caps to release?)"
5958 << dendl;
5959 utime_t until = ceph_clock_now() + utime_t(5, 0);
5960 int r = mount_cond.WaitUntil(client_lock, until);
5961 if (r == ETIMEDOUT) {
5962 dump_cache(NULL);
5963 }
5964 }
5965 assert(lru.lru_get_size() == 0);
5966 assert(inode_map.empty());
5967
5968 // stop tracing
5969 if (!cct->_conf->client_trace.empty()) {
5970 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
5971 traceout.close();
5972 }
5973
5974 _close_sessions();
5975
5976 mounted = false;
5977
5978 ldout(cct, 2) << "unmounted." << dendl;
5979}
5980
b32b8144
FG
// Public entry point: take the client lock and perform the unmount.
void Client::unmount()
{
  Mutex::Locker lock(client_lock);
  _unmount();
}
5986
7c673cae
FG
5987void Client::flush_cap_releases()
5988{
5989 // send any cap releases
5990 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5991 p != mds_sessions.end();
5992 ++p) {
5993 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
5994 p->first)) {
5995 if (cct->_conf->client_inject_release_failure) {
5996 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
5997 p->second->release->put();
5998 } else {
5999 p->second->con->send_message(p->second->release);
6000 }
6001 p->second->release = 0;
6002 }
6003 }
6004}
6005
// Periodic heartbeat: re-arms itself via the timer, times out pre-mount
// requests, renews caps, flushes cap releases, kicks delayed caps and
// trims the cache.  Runs under client_lock (taken by the timer).
void Client::tick()
{
  // test hook: artificially delay the tick once, then reset the knob
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  // schedule the next tick
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	assert(client_lock.is_locked_by_me());
	tick();
      }));

  utime_t now = ceph_clock_now();

  // before the mount completes, abort the oldest request (and wake its
  // waiters) once it has been pending longer than the mount timeout
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: re-check any inode whose hold time has expired; stop at
  // the first entry that is not yet due
  xlist<Inode*>::iterator p = delayed_caps.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_caps.pop_front();
    cap_list.push_back(&in->cap_item);
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6063
6064void Client::renew_caps()
6065{
6066 ldout(cct, 10) << "renew_caps()" << dendl;
6067 last_cap_renew = ceph_clock_now();
6068
6069 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6070 p != mds_sessions.end();
6071 ++p) {
6072 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6073 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6074 renew_caps(p->second);
6075 }
6076}
6077
6078void Client::renew_caps(MetaSession *session)
6079{
6080 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6081 session->last_cap_renew_request = ceph_clock_now();
6082 uint64_t seq = ++session->cap_renew_seq;
6083 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6084}
6085
6086
6087// ===============================================================
6088// high level (POSIXy) interface
6089
// Issue a LOOKUP (or LOOKUPSNAP inside a snapshot dir) to the MDS for
// `name` under `dir`, requesting at least `mask` caps on the result.
// On success *target is set to the found inode.  Returns 0 or -errno.
int Client::_do_lookup(Inode *dir, const string& name, int mask,
		       InodeRef *target, const UserPerm& perms)
{
  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // debug option: ask for extra caps so cap handling can be exercised
  if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
    mask |= DEBUG_GETATTR_CAPS;
  req->head.args.getattr.mask = mask;

  ldout(cct, 10) << "_do_lookup on " << path << dendl;

  int r = make_request(req, perms, target);
  ldout(cct, 10) << "_do_lookup res is " << r << dendl;
  return r;
}
6110
// Look up `dname` in `dir`, answering from the local cache when a dentry
// lease or the directory's FILE_SHARED cap makes that safe, and falling
// back to an MDS request otherwise.  Handles ".", "..", the snapdir and
// local negative (ENOENT) answers for I_COMPLETE directories.  On success
// *target is set.  Returns 0 or a negative errno.
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  // ".." resolves via the first parent dentry (dirs can't be hard-linked);
  // a root-like dir with no parent resolves to itself
  if (dname == "..") {
    if (dir->dn_set.empty())
      *target = dir;
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // the magic snapshot directory (e.g. ".snap") is synthesized locally
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	   << " seq " << dn->lease_seq
	   << dendl;

    // we can only trust the cached dentry if the linked inode carries the
    // caps the caller asked for (or the dentry is negative)
    if (!dn->inode || dn->inode->caps_issued_mask(mask)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	// the lease is only good while the issuing session's caps are
	// fresh and from the same generation
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?  FILE_SHARED on the directory also validates dentries
      // created in the current shared generation
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask)))
	      goto hit_dn;
	// a complete directory lets us answer ENOENT without asking the MDS
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache can't answer — ask the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  // cache hit: a negative dentry yields ENOENT, otherwise return the inode
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6212
// Find the dentry `name` in `dir`, or link in a new null dentry for it.
// When expect_null is set, fail with -EEXIST if a lease-valid dentry
// already points at an inode.  On success *pdn is set and 0 is returned.
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession *s = mds_sessions[dn->lease_mds];
      // only trust the lease while the issuing session's caps are fresh
      // and from the same generation
      if (s->cap_ttl > now &&
	  s->cap_gen == dn->lease_gen) {
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6244
// Resolve `origpath` (absolute from root, else relative to cwd) one
// component at a time, following directory symlinks always and a trailing
// symlink only when `followsym` is set, with a MAXSYMLINKS loop guard.
// `mask` is extra caps to request on the final component.  On success the
// final inode is swapped into *end (if non-null).  Returns 0 or -errno.
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    // with client-side permission checking, each traversed directory
    // needs a search-permission check first
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// an absolute target restarts resolution from the root
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6326
6327
6328// namespace ops
6329
6330int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6331{
6332 Mutex::Locker lock(client_lock);
6333 tout(cct) << "link" << std::endl;
6334 tout(cct) << relexisting << std::endl;
6335 tout(cct) << relpath << std::endl;
6336
181888fb
FG
6337 if (unmounting)
6338 return -ENOTCONN;
6339
7c673cae
FG
6340 filepath existing(relexisting);
6341
6342 InodeRef in, dir;
6343 int r = path_walk(existing, &in, perm, true);
6344 if (r < 0)
6345 return r;
6346 if (std::string(relpath) == "/") {
6347 r = -EEXIST;
6348 return r;
6349 }
6350 filepath path(relpath);
6351 string name = path.last_dentry();
6352 path.pop_dentry();
6353
6354 r = path_walk(path, &dir, perm, true);
6355 if (r < 0)
6356 return r;
6357 if (cct->_conf->client_permissions) {
6358 if (S_ISDIR(in->mode)) {
6359 r = -EPERM;
6360 return r;
6361 }
6362 r = may_hardlink(in.get(), perm);
6363 if (r < 0)
6364 return r;
6365 r = may_create(dir.get(), perm);
6366 if (r < 0)
6367 return r;
6368 }
6369 r = _link(in.get(), dir.get(), name.c_str(), perm);
6370 return r;
6371}
6372
6373int Client::unlink(const char *relpath, const UserPerm& perm)
6374{
6375 Mutex::Locker lock(client_lock);
6376 tout(cct) << "unlink" << std::endl;
6377 tout(cct) << relpath << std::endl;
6378
181888fb
FG
6379 if (unmounting)
6380 return -ENOTCONN;
6381
7c673cae
FG
6382 if (std::string(relpath) == "/")
6383 return -EISDIR;
6384
6385 filepath path(relpath);
6386 string name = path.last_dentry();
6387 path.pop_dentry();
6388 InodeRef dir;
6389 int r = path_walk(path, &dir, perm);
6390 if (r < 0)
6391 return r;
6392 if (cct->_conf->client_permissions) {
6393 r = may_delete(dir.get(), name.c_str(), perm);
6394 if (r < 0)
6395 return r;
6396 }
6397 return _unlink(dir.get(), name.c_str(), perm);
6398}
6399
6400int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6401{
6402 Mutex::Locker lock(client_lock);
6403 tout(cct) << "rename" << std::endl;
6404 tout(cct) << relfrom << std::endl;
6405 tout(cct) << relto << std::endl;
6406
181888fb
FG
6407 if (unmounting)
6408 return -ENOTCONN;
6409
7c673cae
FG
6410 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6411 return -EBUSY;
6412
6413 filepath from(relfrom);
6414 filepath to(relto);
6415 string fromname = from.last_dentry();
6416 from.pop_dentry();
6417 string toname = to.last_dentry();
6418 to.pop_dentry();
6419
6420 InodeRef fromdir, todir;
6421 int r = path_walk(from, &fromdir, perm);
6422 if (r < 0)
6423 goto out;
6424 r = path_walk(to, &todir, perm);
6425 if (r < 0)
6426 goto out;
6427
6428 if (cct->_conf->client_permissions) {
6429 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6430 if (r < 0)
6431 return r;
6432 r = may_delete(todir.get(), toname.c_str(), perm);
6433 if (r < 0 && r != -ENOENT)
6434 return r;
6435 }
6436 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6437out:
6438 return r;
6439}
6440
6441// dirs
6442
6443int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6444{
6445 Mutex::Locker lock(client_lock);
6446 tout(cct) << "mkdir" << std::endl;
6447 tout(cct) << relpath << std::endl;
6448 tout(cct) << mode << std::endl;
6449 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6450
181888fb
FG
6451 if (unmounting)
6452 return -ENOTCONN;
6453
7c673cae
FG
6454 if (std::string(relpath) == "/")
6455 return -EEXIST;
6456
6457 filepath path(relpath);
6458 string name = path.last_dentry();
6459 path.pop_dentry();
6460 InodeRef dir;
6461 int r = path_walk(path, &dir, perm);
6462 if (r < 0)
6463 return r;
6464 if (cct->_conf->client_permissions) {
6465 r = may_create(dir.get(), perm);
6466 if (r < 0)
6467 return r;
6468 }
6469 return _mkdir(dir.get(), name.c_str(), mode, perm);
6470}
6471
6472int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6473{
6474 Mutex::Locker lock(client_lock);
6475 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
6476 tout(cct) << "mkdirs" << std::endl;
6477 tout(cct) << relpath << std::endl;
6478 tout(cct) << mode << std::endl;
6479
181888fb
FG
6480 if (unmounting)
6481 return -ENOTCONN;
6482
7c673cae
FG
6483 //get through existing parts of path
6484 filepath path(relpath);
6485 unsigned int i;
6486 int r = 0, caps = 0;
6487 InodeRef cur, next;
6488 cur = cwd;
6489 for (i=0; i<path.depth(); ++i) {
6490 if (cct->_conf->client_permissions) {
6491 r = may_lookup(cur.get(), perms);
6492 if (r < 0)
6493 break;
6494 caps = CEPH_CAP_AUTH_SHARED;
6495 }
6496 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6497 if (r < 0)
6498 break;
6499 cur.swap(next);
6500 }
6501 //check that we have work left to do
6502 if (i==path.depth()) return -EEXIST;
6503 if (r!=-ENOENT) return r;
6504 ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
6505 //make new directory at each level
6506 for (; i<path.depth(); ++i) {
6507 if (cct->_conf->client_permissions) {
6508 r = may_create(cur.get(), perms);
6509 if (r < 0)
6510 return r;
6511 }
6512 //make new dir
6513 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
c07f9fc5 6514
7c673cae 6515 //check proper creation/existence
c07f9fc5
FG
6516 if(-EEXIST == r && i < path.depth() - 1) {
6517 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6518 }
6519 if (r < 0)
6520 return r;
7c673cae
FG
6521 //move to new dir and continue
6522 cur.swap(next);
6523 ldout(cct, 20) << "mkdirs: successfully created directory "
6524 << filepath(cur->ino).get_path() << dendl;
6525 }
6526 return 0;
6527}
6528
6529int Client::rmdir(const char *relpath, const UserPerm& perms)
6530{
6531 Mutex::Locker lock(client_lock);
6532 tout(cct) << "rmdir" << std::endl;
6533 tout(cct) << relpath << std::endl;
6534
181888fb
FG
6535 if (unmounting)
6536 return -ENOTCONN;
6537
7c673cae
FG
6538 if (std::string(relpath) == "/")
6539 return -EBUSY;
6540
6541 filepath path(relpath);
6542 string name = path.last_dentry();
6543 path.pop_dentry();
6544 InodeRef dir;
6545 int r = path_walk(path, &dir, perms);
6546 if (r < 0)
6547 return r;
6548 if (cct->_conf->client_permissions) {
6549 int r = may_delete(dir.get(), name.c_str(), perms);
6550 if (r < 0)
6551 return r;
6552 }
6553 return _rmdir(dir.get(), name.c_str(), perms);
6554}
6555
6556int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6557{
6558 Mutex::Locker lock(client_lock);
6559 tout(cct) << "mknod" << std::endl;
6560 tout(cct) << relpath << std::endl;
6561 tout(cct) << mode << std::endl;
6562 tout(cct) << rdev << std::endl;
6563
181888fb
FG
6564 if (unmounting)
6565 return -ENOTCONN;
6566
7c673cae
FG
6567 if (std::string(relpath) == "/")
6568 return -EEXIST;
6569
6570 filepath path(relpath);
6571 string name = path.last_dentry();
6572 path.pop_dentry();
6573 InodeRef dir;
6574 int r = path_walk(path, &dir, perms);
6575 if (r < 0)
6576 return r;
6577 if (cct->_conf->client_permissions) {
6578 int r = may_create(dir.get(), perms);
6579 if (r < 0)
6580 return r;
6581 }
6582 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6583}
6584
6585// symlinks
6586
6587int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6588{
6589 Mutex::Locker lock(client_lock);
6590 tout(cct) << "symlink" << std::endl;
6591 tout(cct) << target << std::endl;
6592 tout(cct) << relpath << std::endl;
6593
181888fb
FG
6594 if (unmounting)
6595 return -ENOTCONN;
6596
7c673cae
FG
6597 if (std::string(relpath) == "/")
6598 return -EEXIST;
6599
6600 filepath path(relpath);
6601 string name = path.last_dentry();
6602 path.pop_dentry();
6603 InodeRef dir;
6604 int r = path_walk(path, &dir, perms);
6605 if (r < 0)
6606 return r;
6607 if (cct->_conf->client_permissions) {
6608 int r = may_create(dir.get(), perms);
6609 if (r < 0)
6610 return r;
6611 }
6612 return _symlink(dir.get(), name.c_str(), target, perms);
6613}
6614
6615int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6616{
6617 Mutex::Locker lock(client_lock);
6618 tout(cct) << "readlink" << std::endl;
6619 tout(cct) << relpath << std::endl;
6620
181888fb
FG
6621 if (unmounting)
6622 return -ENOTCONN;
6623
7c673cae
FG
6624 filepath path(relpath);
6625 InodeRef in;
6626 int r = path_walk(path, &in, perms, false);
6627 if (r < 0)
6628 return r;
6629
6630 return _readlink(in.get(), buf, size);
6631}
6632
6633int Client::_readlink(Inode *in, char *buf, size_t size)
6634{
6635 if (!in->is_symlink())
6636 return -EINVAL;
6637
6638 // copy into buf (at most size bytes)
6639 int r = in->symlink.length();
6640 if (r > (int)size)
6641 r = size;
6642 memcpy(buf, in->symlink.c_str(), r);
6643 return r;
6644}
6645
6646
6647// inode stuff
6648
6649int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6650{
6651 bool yes = in->caps_issued_mask(mask);
6652
6653 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6654 if (yes && !force)
6655 return 0;
6656
6657 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6658 filepath path;
6659 in->make_nosnap_relative_path(path);
6660 req->set_filepath(path);
6661 req->set_inode(in);
6662 req->head.args.getattr.mask = mask;
6663
6664 int res = make_request(req, perms);
6665 ldout(cct, 10) << "_getattr result=" << res << dendl;
6666 return res;
6667}
6668
// Apply the attribute changes selected by 'mask' from *stx to inode 'in'.
// Changes covered by exclusive caps we hold are applied locally (and the
// caps marked dirty for later writeback); whatever remains in 'mask' is
// sent to the MDS as a synchronous SETATTR request. On success *inp (if
// non-NULL) receives the resulting inode. Returns 0 or a negative errno.
6669int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6670 const UserPerm& perms, InodeRef *inp)
6671{
6672 int issued = in->caps_issued();
6673
6674 ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
6675 ccap_string(issued) << dendl;
6676
// snapshots are immutable
6677 if (in->snapid != CEPH_NOSNAP) {
6678 return -EROFS;
6679 }
// growing the file must not push us over the byte quota
// NOTE(review): stx_size is cast to unsigned long here — on a 32-bit
// build that could truncate a >4GB size; confirm target platforms.
6680 if ((mask & CEPH_SETATTR_SIZE) &&
6681 (unsigned long)stx->stx_size > in->size &&
6682 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6683 perms)) {
6684 return -EDQUOT;
6685 }
6686
6687 // make the change locally?
// Local (buffered) changes are only safe when the caller matches the
// identity that previously dirtied the caps; otherwise force a sync
// request so permission checks happen in the right order.
6688 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6689 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6690 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6691 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6692 << in->cap_dirtier_gid << ", forcing sync setattr"
6693 << dendl;
6694 /*
6695 * This works because we implicitly flush the caps as part of the
6696 * request, so the cap update check will happen with the writeback
6697 * cap context, and then the setattr check will happen with the
6698 * caller's context.
6699 *
6700 * In reality this pattern is likely pretty rare (different users
6701 * setattr'ing the same file). If that turns out not to be the
6702 * case later, we can build a more complex pipelined cap writeback
6703 * infrastructure...
6704 */
6705 if (!mask)
6706 mask |= CEPH_SETATTR_CTIME;
6707 goto force_request;
6708 }
6709
6710 if (!mask) {
6711 // caller just needs us to bump the ctime
6712 in->ctime = ceph_clock_now();
6713 in->cap_dirtier_uid = perms.uid();
6714 in->cap_dirtier_gid = perms.gid();
6715 if (issued & CEPH_CAP_AUTH_EXCL)
6716 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6717 else if (issued & CEPH_CAP_FILE_EXCL)
6718 mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
6719 else if (issued & CEPH_CAP_XATTR_EXCL)
6720 mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL);
6721 else
// no exclusive cap held — fall through to a sync CTIME request
6722 mask |= CEPH_SETATTR_CTIME;
6723 }
6724
// With the AUTH exclusive cap we may change ownership/mode/btime
// locally; each handled bit is cleared from 'mask'.
6725 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6726 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6727
6728 mask &= ~CEPH_SETATTR_KILL_SGUID;
6729
6730 if (mask & CEPH_SETATTR_UID) {
6731 in->ctime = ceph_clock_now();
6732 in->cap_dirtier_uid = perms.uid();
6733 in->cap_dirtier_gid = perms.gid();
6734 in->uid = stx->stx_uid;
6735 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6736 mask &= ~CEPH_SETATTR_UID;
6737 kill_sguid = true;
6738 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6739 }
6740 if (mask & CEPH_SETATTR_GID) {
6741 in->ctime = ceph_clock_now();
6742 in->cap_dirtier_uid = perms.uid();
6743 in->cap_dirtier_gid = perms.gid();
6744 in->gid = stx->stx_gid;
6745 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6746 mask &= ~CEPH_SETATTR_GID;
6747 kill_sguid = true;
6748 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6749 }
6750
6751 if (mask & CEPH_SETATTR_MODE) {
6752 in->ctime = ceph_clock_now();
6753 in->cap_dirtier_uid = perms.uid();
6754 in->cap_dirtier_gid = perms.gid();
// only permission bits change; the file-type bits are preserved
6755 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
6756 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6757 mask &= ~CEPH_SETATTR_MODE;
6758 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
181888fb 6759 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7c673cae 6760 /* Must squash the any setuid/setgid bits with an ownership change */
181888fb 6761 in->mode &= ~(S_ISUID|S_ISGID);
7c673cae
FG
6762 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6763 }
6764
6765 if (mask & CEPH_SETATTR_BTIME) {
6766 in->ctime = ceph_clock_now();
6767 in->cap_dirtier_uid = perms.uid();
6768 in->cap_dirtier_gid = perms.gid();
6769 in->btime = utime_t(stx->stx_btime);
6770 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6771 mask &= ~CEPH_SETATTR_BTIME;
6772 ldout(cct,10) << "changing btime to " << in->btime << dendl;
6773 }
6774 } else if (mask & CEPH_SETATTR_SIZE) {
6775 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6776 mask |= CEPH_SETATTR_KILL_SGUID;
6777 }
6778
// With the FILE exclusive cap, mtime/atime can be updated locally too.
6779 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
6780 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
6781 if (mask & CEPH_SETATTR_MTIME)
6782 in->mtime = utime_t(stx->stx_mtime);
6783 if (mask & CEPH_SETATTR_ATIME)
6784 in->atime = utime_t(stx->stx_atime);
6785 in->ctime = ceph_clock_now();
6786 in->cap_dirtier_uid = perms.uid();
6787 in->cap_dirtier_gid = perms.gid();
6788 in->time_warp_seq++;
6789 mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
6790 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
6791 }
6792 }
// everything was handled locally — done
6793 if (!mask) {
6794 in->change_attr++;
6795 return 0;
6796 }
6797
// Whatever remains in 'mask' must go to the MDS synchronously.
6798force_request:
6799 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
6800
6801 filepath path;
6802
6803 in->make_nosnap_relative_path(path);
6804 req->set_filepath(path);
6805 req->set_inode(in);
6806
6807 if (mask & CEPH_SETATTR_KILL_SGUID) {
6808 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6809 }
6810 if (mask & CEPH_SETATTR_MODE) {
6811 req->head.args.setattr.mode = stx->stx_mode;
6812 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6813 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6814 }
6815 if (mask & CEPH_SETATTR_UID) {
6816 req->head.args.setattr.uid = stx->stx_uid;
6817 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6818 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6819 }
6820 if (mask & CEPH_SETATTR_GID) {
6821 req->head.args.setattr.gid = stx->stx_gid;
6822 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6823 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6824 }
6825 if (mask & CEPH_SETATTR_BTIME) {
6826 req->head.args.setattr.btime = utime_t(stx->stx_btime);
6827 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6828 }
6829 if (mask & CEPH_SETATTR_MTIME) {
6830 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
6831 req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
6832 CEPH_CAP_FILE_WR;
6833 }
6834 if (mask & CEPH_SETATTR_ATIME) {
6835 req->head.args.setattr.atime = utime_t(stx->stx_atime);
6836 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
6837 CEPH_CAP_FILE_WR;
6838 }
6839 if (mask & CEPH_SETATTR_SIZE) {
// refuse truncate/extend beyond the mds-imposed max file size
6840 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
6841 req->head.args.setattr.size = stx->stx_size;
6842 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
6843 } else { //too big!
6844 put_request(req);
6845 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
6846 return -EFBIG;
6847 }
6848 req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
6849 CEPH_CAP_FILE_WR;
6850 }
6851 req->head.args.setattr.mask = mask;
6852
6853 req->regetattr_mask = mask;
6854
6855 int res = make_request(req, perms, inp);
6856 ldout(cct, 10) << "_setattr result=" << res << dendl;
6857 return res;
6858}
6859
6860/* Note that we only care about attrs that setattr cares about */
6861void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6862{
6863 stx->stx_size = st->st_size;
6864 stx->stx_mode = st->st_mode;
6865 stx->stx_uid = st->st_uid;
6866 stx->stx_gid = st->st_gid;
6867 stx->stx_mtime = st->st_mtim;
6868 stx->stx_atime = st->st_atim;
6869}
6870
6871int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6872 const UserPerm& perms, InodeRef *inp)
6873{
6874 int ret = _do_setattr(in, stx, mask, perms, inp);
6875 if (ret < 0)
6876 return ret;
6877 if (mask & CEPH_SETATTR_MODE)
6878 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6879 return ret;
6880}
6881
6882int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6883 const UserPerm& perms)
6884{
6885 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6886 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6887 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6888 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6889 if (cct->_conf->client_permissions) {
6890 int r = may_setattr(in.get(), stx, mask, perms);
6891 if (r < 0)
6892 return r;
6893 }
6894 return __setattrx(in.get(), stx, mask, perms);
6895}
6896
6897int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6898 const UserPerm& perms)
6899{
6900 struct ceph_statx stx;
6901
6902 stat_to_statx(attr, &stx);
6903 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
6904
6905 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6906 mask &= ~CEPH_SETATTR_UID;
6907 }
6908 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6909 mask &= ~CEPH_SETATTR_GID;
6910 }
6911
7c673cae
FG
6912 return _setattrx(in, &stx, mask, perms);
6913}
6914
6915int Client::setattr(const char *relpath, struct stat *attr, int mask,
6916 const UserPerm& perms)
6917{
6918 Mutex::Locker lock(client_lock);
6919 tout(cct) << "setattr" << std::endl;
6920 tout(cct) << relpath << std::endl;
6921 tout(cct) << mask << std::endl;
6922
181888fb
FG
6923 if (unmounting)
6924 return -ENOTCONN;
6925
7c673cae
FG
6926 filepath path(relpath);
6927 InodeRef in;
6928 int r = path_walk(path, &in, perms);
6929 if (r < 0)
6930 return r;
6931 return _setattr(in, attr, mask, perms);
6932}
6933
6934int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6935 const UserPerm& perms, int flags)
6936{
6937 Mutex::Locker lock(client_lock);
6938 tout(cct) << "setattrx" << std::endl;
6939 tout(cct) << relpath << std::endl;
6940 tout(cct) << mask << std::endl;
6941
181888fb
FG
6942 if (unmounting)
6943 return -ENOTCONN;
6944
7c673cae
FG
6945 filepath path(relpath);
6946 InodeRef in;
6947 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6948 if (r < 0)
6949 return r;
6950 return _setattrx(in, stx, mask, perms);
6951}
6952
6953int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6954{
6955 Mutex::Locker lock(client_lock);
6956 tout(cct) << "fsetattr" << std::endl;
6957 tout(cct) << fd << std::endl;
6958 tout(cct) << mask << std::endl;
6959
181888fb
FG
6960 if (unmounting)
6961 return -ENOTCONN;
6962
7c673cae
FG
6963 Fh *f = get_filehandle(fd);
6964 if (!f)
6965 return -EBADF;
6966#if defined(__linux__) && defined(O_PATH)
6967 if (f->flags & O_PATH)
6968 return -EBADF;
6969#endif
6970 return _setattr(f->inode, attr, mask, perms);
6971}
6972
6973int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6974{
6975 Mutex::Locker lock(client_lock);
6976 tout(cct) << "fsetattr" << std::endl;
6977 tout(cct) << fd << std::endl;
6978 tout(cct) << mask << std::endl;
6979
181888fb
FG
6980 if (unmounting)
6981 return -ENOTCONN;
6982
7c673cae
FG
6983 Fh *f = get_filehandle(fd);
6984 if (!f)
6985 return -EBADF;
6986#if defined(__linux__) && defined(O_PATH)
6987 if (f->flags & O_PATH)
6988 return -EBADF;
6989#endif
6990 return _setattrx(f->inode, stx, mask, perms);
6991}
6992
6993int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
6994 frag_info_t *dirstat, int mask)
6995{
6996 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6997 Mutex::Locker lock(client_lock);
6998 tout(cct) << "stat" << std::endl;
6999 tout(cct) << relpath << std::endl;
181888fb
FG
7000
7001 if (unmounting)
7002 return -ENOTCONN;
7003
7c673cae
FG
7004 filepath path(relpath);
7005 InodeRef in;
7006 int r = path_walk(path, &in, perms, true, mask);
7007 if (r < 0)
7008 return r;
7009 r = _getattr(in, mask, perms);
7010 if (r < 0) {
7011 ldout(cct, 3) << "stat exit on error!" << dendl;
7012 return r;
7013 }
7014 fill_stat(in, stbuf, dirstat);
7015 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7016 return r;
7017}
7018
7019unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7020{
7021 unsigned mask = 0;
7022
7023 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7024 if (flags & AT_NO_ATTR_SYNC)
7025 goto out;
7026
7027 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7028 mask |= CEPH_CAP_PIN;
7029 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7030 mask |= CEPH_CAP_AUTH_SHARED;
7031 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7032 mask |= CEPH_CAP_LINK_SHARED;
7033 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7034 mask |= CEPH_CAP_FILE_SHARED;
7035 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7036 mask |= CEPH_CAP_XATTR_SHARED;
7037out:
7038 return mask;
7039}
7040
7041int Client::statx(const char *relpath, struct ceph_statx *stx,
7042 const UserPerm& perms,
7043 unsigned int want, unsigned int flags)
7044{
7045 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
7046 Mutex::Locker lock(client_lock);
7047 tout(cct) << "statx" << std::endl;
7048 tout(cct) << relpath << std::endl;
181888fb
FG
7049
7050 if (unmounting)
7051 return -ENOTCONN;
7052
7c673cae
FG
7053 filepath path(relpath);
7054 InodeRef in;
7055
7056 unsigned mask = statx_to_mask(flags, want);
7057
7058 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7059 if (r < 0)
7060 return r;
7061
7062 r = _getattr(in, mask, perms);
7063 if (r < 0) {
7064 ldout(cct, 3) << "statx exit on error!" << dendl;
7065 return r;
7066 }
7067
7068 fill_statx(in, mask, stx);
7069 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7070 return r;
7071}
7072
7073int Client::lstat(const char *relpath, struct stat *stbuf,
7074 const UserPerm& perms, frag_info_t *dirstat, int mask)
7075{
7076 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7077 Mutex::Locker lock(client_lock);
7078 tout(cct) << "lstat" << std::endl;
7079 tout(cct) << relpath << std::endl;
181888fb
FG
7080
7081 if (unmounting)
7082 return -ENOTCONN;
7083
7c673cae
FG
7084 filepath path(relpath);
7085 InodeRef in;
7086 // don't follow symlinks
7087 int r = path_walk(path, &in, perms, false, mask);
7088 if (r < 0)
7089 return r;
7090 r = _getattr(in, mask, perms);
7091 if (r < 0) {
7092 ldout(cct, 3) << "lstat exit on error!" << dendl;
7093 return r;
7094 }
7095 fill_stat(in, stbuf, dirstat);
7096 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7097 return r;
7098}
7099
7100int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7101{
7102 ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
7103 << " mode 0" << oct << in->mode << dec
7104 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7105 memset(st, 0, sizeof(struct stat));
7106 if (use_faked_inos())
7107 st->st_ino = in->faked_ino;
7108 else
7109 st->st_ino = in->ino;
7110 st->st_dev = in->snapid;
7111 st->st_mode = in->mode;
7112 st->st_rdev = in->rdev;
7113 st->st_nlink = in->nlink;
7114 st->st_uid = in->uid;
7115 st->st_gid = in->gid;
7116 if (in->ctime > in->mtime) {
7117 stat_set_ctime_sec(st, in->ctime.sec());
7118 stat_set_ctime_nsec(st, in->ctime.nsec());
7119 } else {
7120 stat_set_ctime_sec(st, in->mtime.sec());
7121 stat_set_ctime_nsec(st, in->mtime.nsec());
7122 }
7123 stat_set_atime_sec(st, in->atime.sec());
7124 stat_set_atime_nsec(st, in->atime.nsec());
7125 stat_set_mtime_sec(st, in->mtime.sec());
7126 stat_set_mtime_nsec(st, in->mtime.nsec());
7127 if (in->is_dir()) {
7128 if (cct->_conf->client_dirsize_rbytes)
7129 st->st_size = in->rstat.rbytes;
7130 else
7131 st->st_size = in->dirstat.size();
7132 st->st_blocks = 1;
7133 } else {
7134 st->st_size = in->size;
7135 st->st_blocks = (in->size + 511) >> 9;
7136 }
7137 st->st_blksize = MAX(in->layout.stripe_unit, 4096);
7138
7139 if (dirstat)
7140 *dirstat = in->dirstat;
7141 if (rstat)
7142 *rstat = in->rstat;
7143
7144 return in->caps_issued();
7145}
7146
7147void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7148{
7149 ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
7150 << " mode 0" << oct << in->mode << dec
7151 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7152 memset(stx, 0, sizeof(struct ceph_statx));
7153
7154 /*
7155 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7156 * so that all bits are set.
7157 */
7158 if (!mask)
7159 mask = ~0;
7160
7161 /* These are always considered to be available */
7162 stx->stx_dev = in->snapid;
7163 stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);
7164
7165 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7166 stx->stx_mode = S_IFMT & in->mode;
7167 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7168 stx->stx_rdev = in->rdev;
7169 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7170
7171 if (mask & CEPH_CAP_AUTH_SHARED) {
7172 stx->stx_uid = in->uid;
7173 stx->stx_gid = in->gid;
7174 stx->stx_mode = in->mode;
7175 in->btime.to_timespec(&stx->stx_btime);
7176 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7177 }
7178
7179 if (mask & CEPH_CAP_LINK_SHARED) {
7180 stx->stx_nlink = in->nlink;
7181 stx->stx_mask |= CEPH_STATX_NLINK;
7182 }
7183
7184 if (mask & CEPH_CAP_FILE_SHARED) {
7185
7186 in->atime.to_timespec(&stx->stx_atime);
7187 in->mtime.to_timespec(&stx->stx_mtime);
7188
7189 if (in->is_dir()) {
7190 if (cct->_conf->client_dirsize_rbytes)
7191 stx->stx_size = in->rstat.rbytes;
7192 else
7193 stx->stx_size = in->dirstat.size();
7194 stx->stx_blocks = 1;
7195 } else {
7196 stx->stx_size = in->size;
7197 stx->stx_blocks = (in->size + 511) >> 9;
7198 }
7199 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7200 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7201 }
7202
7203 /* Change time and change_attr both require all shared caps to view */
7204 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7205 stx->stx_version = in->change_attr;
7206 if (in->ctime > in->mtime)
7207 in->ctime.to_timespec(&stx->stx_ctime);
7208 else
7209 in->mtime.to_timespec(&stx->stx_ctime);
7210 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7211 }
7212
7213}
7214
7215void Client::touch_dn(Dentry *dn)
7216{
7217 lru.lru_touch(dn);
7218}
7219
7220int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7221{
7222 Mutex::Locker lock(client_lock);
7223 tout(cct) << "chmod" << std::endl;
7224 tout(cct) << relpath << std::endl;
7225 tout(cct) << mode << std::endl;
181888fb
FG
7226
7227 if (unmounting)
7228 return -ENOTCONN;
7229
7c673cae
FG
7230 filepath path(relpath);
7231 InodeRef in;
7232 int r = path_walk(path, &in, perms);
7233 if (r < 0)
7234 return r;
7235 struct stat attr;
7236 attr.st_mode = mode;
7237 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7238}
7239
7240int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7241{
7242 Mutex::Locker lock(client_lock);
7243 tout(cct) << "fchmod" << std::endl;
7244 tout(cct) << fd << std::endl;
7245 tout(cct) << mode << std::endl;
181888fb
FG
7246
7247 if (unmounting)
7248 return -ENOTCONN;
7249
7c673cae
FG
7250 Fh *f = get_filehandle(fd);
7251 if (!f)
7252 return -EBADF;
7253#if defined(__linux__) && defined(O_PATH)
7254 if (f->flags & O_PATH)
7255 return -EBADF;
7256#endif
7257 struct stat attr;
7258 attr.st_mode = mode;
7259 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7260}
7261
7262int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7263{
7264 Mutex::Locker lock(client_lock);
7265 tout(cct) << "lchmod" << std::endl;
7266 tout(cct) << relpath << std::endl;
7267 tout(cct) << mode << std::endl;
181888fb
FG
7268
7269 if (unmounting)
7270 return -ENOTCONN;
7271
7c673cae
FG
7272 filepath path(relpath);
7273 InodeRef in;
7274 // don't follow symlinks
7275 int r = path_walk(path, &in, perms, false);
7276 if (r < 0)
7277 return r;
7278 struct stat attr;
7279 attr.st_mode = mode;
7280 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7281}
7282
7283int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7284 const UserPerm& perms)
7285{
7286 Mutex::Locker lock(client_lock);
7287 tout(cct) << "chown" << std::endl;
7288 tout(cct) << relpath << std::endl;
7289 tout(cct) << new_uid << std::endl;
7290 tout(cct) << new_gid << std::endl;
181888fb
FG
7291
7292 if (unmounting)
7293 return -ENOTCONN;
7294
7c673cae
FG
7295 filepath path(relpath);
7296 InodeRef in;
7297 int r = path_walk(path, &in, perms);
7298 if (r < 0)
7299 return r;
7300 struct stat attr;
7301 attr.st_uid = new_uid;
7302 attr.st_gid = new_gid;
181888fb 7303 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7304}
7305
7306int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7307{
7308 Mutex::Locker lock(client_lock);
7309 tout(cct) << "fchown" << std::endl;
7310 tout(cct) << fd << std::endl;
7311 tout(cct) << new_uid << std::endl;
7312 tout(cct) << new_gid << std::endl;
181888fb
FG
7313
7314 if (unmounting)
7315 return -ENOTCONN;
7316
7c673cae
FG
7317 Fh *f = get_filehandle(fd);
7318 if (!f)
7319 return -EBADF;
7320#if defined(__linux__) && defined(O_PATH)
7321 if (f->flags & O_PATH)
7322 return -EBADF;
7323#endif
7324 struct stat attr;
7325 attr.st_uid = new_uid;
7326 attr.st_gid = new_gid;
7327 int mask = 0;
7328 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7329 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7330 return _setattr(f->inode, &attr, mask, perms);
7331}
7332
7333int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7334 const UserPerm& perms)
7335{
7336 Mutex::Locker lock(client_lock);
7337 tout(cct) << "lchown" << std::endl;
7338 tout(cct) << relpath << std::endl;
7339 tout(cct) << new_uid << std::endl;
7340 tout(cct) << new_gid << std::endl;
181888fb
FG
7341
7342 if (unmounting)
7343 return -ENOTCONN;
7344
7c673cae
FG
7345 filepath path(relpath);
7346 InodeRef in;
7347 // don't follow symlinks
7348 int r = path_walk(path, &in, perms, false);
7349 if (r < 0)
7350 return r;
7351 struct stat attr;
7352 attr.st_uid = new_uid;
7353 attr.st_gid = new_gid;
7354 int mask = 0;
7355 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7356 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7357 return _setattr(in, &attr, mask, perms);
7358}
7359
7360int Client::utime(const char *relpath, struct utimbuf *buf,
7361 const UserPerm& perms)
7362{
7363 Mutex::Locker lock(client_lock);
7364 tout(cct) << "utime" << std::endl;
7365 tout(cct) << relpath << std::endl;
7366 tout(cct) << buf->modtime << std::endl;
7367 tout(cct) << buf->actime << std::endl;
181888fb
FG
7368
7369 if (unmounting)
7370 return -ENOTCONN;
7371
7c673cae
FG
7372 filepath path(relpath);
7373 InodeRef in;
7374 int r = path_walk(path, &in, perms);
7375 if (r < 0)
7376 return r;
7377 struct stat attr;
7378 stat_set_mtime_sec(&attr, buf->modtime);
7379 stat_set_mtime_nsec(&attr, 0);
7380 stat_set_atime_sec(&attr, buf->actime);
7381 stat_set_atime_nsec(&attr, 0);
7382 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7383}
7384
7385int Client::lutime(const char *relpath, struct utimbuf *buf,
7386 const UserPerm& perms)
7387{
7388 Mutex::Locker lock(client_lock);
7389 tout(cct) << "lutime" << std::endl;
7390 tout(cct) << relpath << std::endl;
7391 tout(cct) << buf->modtime << std::endl;
7392 tout(cct) << buf->actime << std::endl;
181888fb
FG
7393
7394 if (unmounting)
7395 return -ENOTCONN;
7396
7c673cae
FG
7397 filepath path(relpath);
7398 InodeRef in;
7399 // don't follow symlinks
7400 int r = path_walk(path, &in, perms, false);
7401 if (r < 0)
7402 return r;
7403 struct stat attr;
7404 stat_set_mtime_sec(&attr, buf->modtime);
7405 stat_set_mtime_nsec(&attr, 0);
7406 stat_set_atime_sec(&attr, buf->actime);
7407 stat_set_atime_nsec(&attr, 0);
7408 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7409}
7410
7411int Client::flock(int fd, int operation, uint64_t owner)
7412{
7413 Mutex::Locker lock(client_lock);
7414 tout(cct) << "flock" << std::endl;
7415 tout(cct) << fd << std::endl;
7416 tout(cct) << operation << std::endl;
7417 tout(cct) << owner << std::endl;
181888fb
FG
7418
7419 if (unmounting)
7420 return -ENOTCONN;
7421
7c673cae
FG
7422 Fh *f = get_filehandle(fd);
7423 if (!f)
7424 return -EBADF;
7425
7426 return _flock(f, operation, owner);
7427}
7428
7429int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7430{
7431 Mutex::Locker lock(client_lock);
7432 tout(cct) << "opendir" << std::endl;
7433 tout(cct) << relpath << std::endl;
181888fb
FG
7434
7435 if (unmounting)
7436 return -ENOTCONN;
7437
7c673cae
FG
7438 filepath path(relpath);
7439 InodeRef in;
7440 int r = path_walk(path, &in, perms, true);
7441 if (r < 0)
7442 return r;
7443 if (cct->_conf->client_permissions) {
7444 int r = may_open(in.get(), O_RDONLY, perms);
7445 if (r < 0)
7446 return r;
7447 }
7448 r = _opendir(in.get(), dirpp, perms);
7449 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7450 if (r != -ENOTDIR)
7451 tout(cct) << (unsigned long)*dirpp << std::endl;
7452 return r;
7453}
7454
7455int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7456{
7457 if (!in->is_dir())
7458 return -ENOTDIR;
7459 *dirpp = new dir_result_t(in, perms);
7460 opened_dirs.insert(*dirpp);
7461 ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7462 return 0;
7463}
7464
7465
// Public closedir(): tear down a cursor previously returned by opendir().
// Always succeeds (returns 0); the cursor is freed by _closedir().
int Client::closedir(dir_result_t *dir)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "closedir" << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7476
// Internal teardown: drop the cursor's inode reference and buffered
// entries, remove it from the open-cursor set, and free it.
// Caller must hold client_lock.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();	// release the InodeRef before delete
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7488
7489void Client::rewinddir(dir_result_t *dirp)
7490{
7491 Mutex::Locker lock(client_lock);
7c673cae 7492 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
181888fb
FG
7493
7494 if (unmounting)
7495 return;
7496
7c673cae
FG
7497 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7498 _readdir_drop_dirp_buffer(d);
7499 d->reset();
7500}
7501
7502loff_t Client::telldir(dir_result_t *dirp)
7503{
7504 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7505 ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
7506 return d->offset;
7507}
7508
// Reposition a directory cursor.  Buffered entries are discarded whenever
// the target offset cannot be served from the current buffer window.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;		// no-op seek

  if (offset > dirp->offset)
    dirp->release_count = 0;   // forward seek: this sweep can no longer be
			       // credited as a full pass over the dir
  else
    dirp->ordered_count = 0;   // backward seek: stop filling the readdir
			       // cache in order (blocks I_DIR_ORDERED)

  if (dirp->hash_order()) {
    // hash-order buffers are only valid for monotonically increasing
    // offsets; a backward seek forces a refetch from the MDS
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag order: drop the buffer on a rewind to 0, a jump into a
    // different frag, or a target before the buffered window
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7542
7543
7544//struct dirent {
7545// ino_t d_ino; /* inode number */
7546// off_t d_off; /* offset to the next dirent */
7547// unsigned short d_reclen; /* length of this record */
7548// unsigned char d_type; /* type of file */
7549// char d_name[256]; /* filename */
7550//};
// Populate a struct dirent from name/type/ino.  `next_off` is the readdir
// position of the *following* entry, which is what d_off means on Linux.
// `type` is S_IF* mode bits, converted to a DT_* value via IFTODT.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';	// strncpy does not terminate at max length
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  de->d_off = next_off;		// d_off is absent on Darwin/FreeBSD
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7566
// Advance the cursor past the frag it just finished reading, or mark the
// cursor at-end if that was the rightmost frag.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    // that was the last frag: the directory is exhausted
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name; hash-order offsets are comparable across frags, so
    // only ever move the offset forward
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // frag order: restart at slot 2 (past . and ..) of the next frag and
    // re-resolve against the current dirfragtree
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7592
// Re-resolve the frag encoded in dirp->offset against the directory's
// current dirfragtree; the tree may have split/merged since the offset
// was computed.  No-op in hash order, whose offsets are layout-independent.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    // frag moved: restart this frag's listing from slot 2 (past . and ..)
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7609
// Discard the dentries buffered from the MDS for this cursor; the next
// read will fetch a fresh frag.
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
  dirp->buffer.clear();
}
7615
// Fetch one dirfrag worth of entries from the MDS into dirp->buffer.
// Returns 0 on success; on any other error the cursor is marked at-end
// and the error returned.  An -EAGAIN reply means our dirfragtree was
// stale: re-resolve the frag and retry (tail recursion).
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  assert(dirp);
  assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;	// .snap directory uses a dedicated op

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // continue listing after the last entry we handed out
    req->path2.set_path(dirp->last_name.c_str());
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // stale frag mapping: re-resolve and retry
    ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7670
// Comparator for std::lower_bound over a Dir's readdir_cache: orders
// cached dentries by their (frag-aware) readdir position via fpos_cmp.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7676
// Serve readdir entries straight from the locally cached Dir, bypassing
// the MDS.  Returns 0 at end-of-dir, <0 on error; -EAGAIN means the cache
// stopped being complete+ordered mid-walk and the caller must fall back
// to fetching frags.  Caller holds client_lock; it is dropped around the
// user callback.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume at the first cached dentry at or past the cursor's offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // the cache can be invalidated while the lock is dropped below; bail
    // out to the MDS path rather than serving stale entries
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      // negative dentry
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // dentry was cached under an older cap generation
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);	// caller-visible reference, released by caller
    }

    dn_name = dn->name; // fill in name while we have lock

    // NOTE(review): the callback runs unlocked, so readdir_cache may
    // mutate and invalidate `pd`; the is_complete_and_ordered() check at
    // the top of the loop appears to be the guard — confirm.
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    if (r > 0)
      return r;	// callback asked us to stop after this entry
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7759
// Core readdir engine: synthesizes "." and "..", then serves entries from
// the local cache when it is complete+ordered, otherwise streams dirfrags
// from the MDS.  `cb` is invoked (unlocked) once per entry; a negative
// callback return aborts with that error, a positive one stops after the
// current entry.  Returns 0 at end-of-directory.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0 is the synthetic "." entry
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // the user callback runs without client_lock held
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1 is the synthetic ".." entry (the dir itself when unlinked/root)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->inode;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache was invalidated mid-walk; fall through to MDS path
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;	// reply is fresh; skip per-entry getattr
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // this frag has more entries than one reply could carry
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // finished the rightmost frag: if nothing changed under us during the
    // sweep, the local cache now holds the complete directory
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();	// not reached: the loop always returns or continues
  return 0;
}
7952
7953
// Plain readdir_r: fill *de with the next entry.  Returns 1 on success,
// 0 at end of directory, <0 on error (see readdirplus_r below).
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
7958
7959/*
7960 * readdirplus_r
7961 *
7962 * returns
7963 * 1 if we got a dirent
7964 * 0 for end of directory
7965 * <0 on error
7966 */
7967
// Context for _readdir_single_dirent_cb: collects exactly one directory
// entry per readdir_r_cb invocation.
struct single_readdir {
  struct dirent *de;	// caller-provided dirent to fill
  struct ceph_statx *stx;	// optional statx destination (may be NULL)
  Inode *inode;		// inode passed through from the readdir callback
  bool full;		// set once an entry has been stored
};
7974
7975static int _readdir_single_dirent_cb(void *p, struct dirent *de,
7976 struct ceph_statx *stx, off_t off,
7977 Inode *in)
7978{
7979 single_readdir *c = static_cast<single_readdir *>(p);
7980
7981 if (c->full)
7982 return -1; // already filled this dirent
7983
7984 *c->de = *de;
7985 if (c->stx)
7986 *c->stx = *stx;
7987 c->inode = in;
7988 c->full = true;
7989 return 1;
7990}
7991
7992struct dirent *Client::readdir(dir_result_t *d)
7993{
7994 int ret;
7995 static struct dirent de;
7996 single_readdir sr;
7997 sr.de = &de;
7998 sr.stx = NULL;
7999 sr.inode = NULL;
8000 sr.full = false;
8001
8002 // our callback fills the dirent and sets sr.full=true on first
8003 // call, and returns -1 the second time around.
8004 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8005 if (ret < -1) {
8006 errno = -ret; // this sucks.
8007 return (dirent *) NULL;
8008 }
8009 if (sr.full) {
8010 return &de;
8011 }
8012 return (dirent *) NULL;
8013}
8014
8015int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8016 struct ceph_statx *stx, unsigned want,
8017 unsigned flags, Inode **out)
8018{
8019 single_readdir sr;
8020 sr.de = de;
8021 sr.stx = stx;
8022 sr.inode = NULL;
8023 sr.full = false;
8024
8025 // our callback fills the dirent and sets sr.full=true on first
8026 // call, and returns -1 the second time around.
8027 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8028 if (r < -1)
8029 return r;
8030 if (out)
8031 *out = sr.inode;
8032 if (sr.full)
8033 return 1;
8034 return 0;
8035}
8036
8037
8038/* getdents */
/* getdents */
// Accumulator for _readdir_getdent_cb: packs entries into a flat buffer.
struct getdents_result {
  char *buf;	// destination buffer
  int buflen;	// total capacity of buf
  int pos;	// bytes written so far
  bool fullent;	// true: copy whole dirents; false: just NUL-terminated names
};
8045
8046static int _readdir_getdent_cb(void *p, struct dirent *de,
8047 struct ceph_statx *stx, off_t off, Inode *in)
8048{
8049 struct getdents_result *c = static_cast<getdents_result *>(p);
8050
8051 int dlen;
8052 if (c->fullent)
8053 dlen = sizeof(*de);
8054 else
8055 dlen = strlen(de->d_name) + 1;
8056
8057 if (c->pos + dlen > c->buflen)
8058 return -1; // doesn't fit
8059
8060 if (c->fullent) {
8061 memcpy(c->buf + c->pos, de, sizeof(*de));
8062 } else {
8063 memcpy(c->buf + c->pos, de->d_name, dlen);
8064 }
8065 c->pos += dlen;
8066 return 0;
8067}
8068
8069int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8070{
8071 getdents_result gr;
8072 gr.buf = buf;
8073 gr.buflen = buflen;
8074 gr.fullent = fullent;
8075 gr.pos = 0;
8076
8077 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8078
8079 if (r < 0) { // some error
8080 if (r == -1) { // buffer ran out of space
8081 if (gr.pos) { // but we got some entries already!
8082 return gr.pos;
8083 } // or we need a larger buffer
8084 return -ERANGE;
8085 } else { // actual error, return it
8086 return r;
8087 }
8088 }
8089 return gr.pos;
8090}
8091
8092
8093/* getdir */
/* getdir */
// Accumulator for _getdir_cb: collects entry names in readdir order.
struct getdir_result {
  list<string> *contents;	// names seen so far
  int num;			// count of entries collected
};
8098
8099static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8100{
8101 getdir_result *r = static_cast<getdir_result *>(p);
8102
8103 r->contents->push_back(de->d_name);
8104 r->num++;
8105 return 0;
8106}
8107
8108int Client::getdir(const char *relpath, list<string>& contents,
8109 const UserPerm& perms)
8110{
8111 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8112 {
8113 Mutex::Locker lock(client_lock);
8114 tout(cct) << "getdir" << std::endl;
8115 tout(cct) << relpath << std::endl;
8116 }
8117
8118 dir_result_t *d;
8119 int r = opendir(relpath, &d, perms);
8120 if (r < 0)
8121 return r;
8122
8123 getdir_result gr;
8124 gr.contents = &contents;
8125 gr.num = 0;
8126 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8127
8128 closedir(d);
8129
8130 if (r < 0)
8131 return r;
8132 return gr.num;
8133}
8134
8135
8136/****** file i/o **********/
/****** file i/o **********/
// Resolve relpath, optionally create the file (O_CREAT), open it, and map
// the resulting Fh to a new integer fd.  Returns the fd or -errno.  The
// striping parameters only apply when a new file is created; 0/NULL
// select the defaults.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // exclusive create but the path already exists
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // create the missing final component in its parent directory
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may already have produced an Fh; only open otherwise
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8221
// Open without explicit striping parameters; 0/NULL select the layout
// defaults (only relevant when the open creates the file).
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8227
8228int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8229 const UserPerm& perms)
8230{
8231 Mutex::Locker lock(client_lock);
8232 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8233
181888fb
FG
8234 if (unmounting)
8235 return -ENOTCONN;
8236
7c673cae
FG
8237 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8238 filepath path(ino);
8239 req->set_filepath(path);
8240
8241 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8242 char f[30];
8243 sprintf(f, "%u", h);
8244 filepath path2(dirino);
8245 path2.push_dentry(string(f));
8246 req->set_filepath2(path2);
8247
8248 int r = make_request(req, perms, NULL, NULL,
8249 rand() % mdsmap->get_num_in_mds());
8250 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8251 return r;
8252}
8253
8254
8255/**
8256 * Load inode into local cache.
8257 *
8258 * If inode pointer is non-NULL, and take a reference on
8259 * the resulting Inode object in one operation, so that caller
8260 * can safely assume inode will still be there after return.
8261 */
8262int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8263{
8264 Mutex::Locker lock(client_lock);
8265 ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;
8266
181888fb
FG
8267 if (unmounting)
8268 return -ENOTCONN;
8269
7c673cae
FG
8270 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8271 filepath path(ino);
8272 req->set_filepath(path);
8273
8274 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8275 if (r == 0 && inode != NULL) {
8276 vinodeno_t vino(ino, CEPH_NOSNAP);
8277 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8278 assert(p != inode_map.end());
8279 *inode = p->second;
8280 _ll_get(*inode);
8281 }
8282 ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
8283 return r;
8284}
8285
8286
8287
8288/**
8289 * Find the parent inode of `ino` and insert it into
8290 * our cache. Conditionally also set `parent` to a referenced
8291 * Inode* if caller provides non-NULL value.
8292 */
8293int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8294{
8295 Mutex::Locker lock(client_lock);
8296 ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
8297
181888fb
FG
8298 if (unmounting)
8299 return -ENOTCONN;
8300
7c673cae
FG
8301 if (!ino->dn_set.empty()) {
8302 // if we exposed the parent here, we'd need to check permissions,
8303 // but right now we just rely on the MDS doing so in make_request
8304 ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
8305 return 0;
8306 }
8307
8308 if (ino->is_root()) {
8309 *parent = NULL;
8310 ldout(cct, 3) << "ino is root, no parent" << dendl;
8311 return -EINVAL;
8312 }
8313
8314 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8315 filepath path(ino->ino);
8316 req->set_filepath(path);
8317
8318 InodeRef target;
8319 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8320 // Give caller a reference to the parent ino if they provided a pointer.
8321 if (parent != NULL) {
8322 if (r == 0) {
8323 *parent = target.get();
8324 _ll_get(*parent);
8325 ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
8326 } else {
8327 *parent = NULL;
8328 }
8329 }
8330 ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
8331 return r;
8332}
8333
8334
8335/**
8336 * Populate the parent dentry for `ino`, provided it is
8337 * a child of `parent`.
8338 */
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  assert(parent->is_dir());

  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  // LOOKUPNAME asks the MDS for the (parent, name) linkage of ino; the
  // reply trace populates our dentry cache as a side effect.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8358
8359
8360 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8361{
8362 assert(in);
8363 Fh *f = new Fh(in);
8364 f->mode = cmode;
8365 f->flags = flags;
8366
8367 // inode
8368 f->actor_perms = perms;
8369
8370 ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;
8371
8372 if (in->snapid != CEPH_NOSNAP) {
8373 in->snap_cap_refs++;
8374 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8375 << ccap_string(in->caps_issued()) << dendl;
8376 }
8377
8378 const md_config_t *conf = cct->_conf;
8379 f->readahead.set_trigger_requests(1);
8380 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8381 uint64_t max_readahead = Readahead::NO_LIMIT;
8382 if (conf->client_readahead_max_bytes) {
8383 max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8384 }
8385 if (conf->client_readahead_max_periods) {
8386 max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8387 }
8388 f->readahead.set_max_readahead_size(max_readahead);
8389 vector<uint64_t> alignments;
8390 alignments.push_back(in->layout.get_period());
8391 alignments.push_back(in->layout.stripe_unit);
8392 f->readahead.set_alignments(alignments);
8393
8394 return f;
8395}
8396
// Tear down a file handle: return any delegation, drop the open ref
// (flushing dirty data when this was the last open in that mode), release
// file locks, and surface any asynchronous write error recorded on the
// handle.  Returns that async error (0 if none).
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref in this mode: flush dirty data and re-evaluate caps
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes track opens via snap_cap_refs instead of open refs
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8431
8432void Client::_put_fh(Fh *f)
8433{
8434 int left = f->put();
8435 if (!left) {
8436 delete f;
8437 }
8438}
8439
// Core open path: take an open ref on `in`, obtain the caps the open mode
// needs (locally if already issued and no O_TRUNC, otherwise via an MDS
// OPEN request), and on success hand back a fresh Fh via *fhp.  Snapshot
// inodes are read-only.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this mode needs; no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 1) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode);	// roll back the pending-open note
      } else {
	put_cap_ref(in, need);	// we only needed them momentarily
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    in->put_open_ref(cmode);	// roll back the pending-open note
  }

  trim_cache();

  return result;
}
8518
// Re-establish the caps this inode's open file handles want, typically after
// an MDS session hiccup.  If we still hold any caps and either want no write
// caps or still know the auth MDS, a check_caps() nudge suffices; otherwise
// re-issue an MDS OPEN with flags synthesized from the wanted cap set.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    // cheap path: just re-advertise wanted caps to the MDS
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate the wanted cap bits back into open flags for the OPEN request
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8556
8557int Client::close(int fd)
8558{
8559 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8560 Mutex::Locker lock(client_lock);
8561 tout(cct) << "close" << std::endl;
8562 tout(cct) << fd << std::endl;
8563
181888fb
FG
8564 if (unmounting)
8565 return -ENOTCONN;
8566
7c673cae
FG
8567 Fh *fh = get_filehandle(fd);
8568 if (!fh)
8569 return -EBADF;
8570 int err = _release_fh(fh);
8571 fd_map.erase(fd);
8572 put_fd(fd);
8573 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8574 return err;
8575}
8576
8577
8578// ------------
8579// read, write
8580
8581loff_t Client::lseek(int fd, loff_t offset, int whence)
8582{
8583 Mutex::Locker lock(client_lock);
8584 tout(cct) << "lseek" << std::endl;
8585 tout(cct) << fd << std::endl;
8586 tout(cct) << offset << std::endl;
8587 tout(cct) << whence << std::endl;
8588
181888fb
FG
8589 if (unmounting)
8590 return -ENOTCONN;
8591
7c673cae
FG
8592 Fh *f = get_filehandle(fd);
8593 if (!f)
8594 return -EBADF;
8595#if defined(__linux__) && defined(O_PATH)
8596 if (f->flags & O_PATH)
8597 return -EBADF;
8598#endif
8599 return _lseek(f, offset, whence);
8600}
8601
// Core lseek: updates f->pos according to whence and returns the new
// position.  SEEK_END refreshes the size from the MDS first.
// NOTE(review): unlike POSIX lseek, no check prevents the resulting
// position from going negative (SEEK_SET/SEEK_CUR with a negative
// offset) — confirm callers guard against this.
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  int r;

  switch (whence) {
  case SEEK_SET:
    f->pos = offset;
    break;

  case SEEK_CUR:
    f->pos += offset;
    break;

  case SEEK_END:
    // re-fetch size so we seek relative to the authoritative EOF
    r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
    f->pos = in->size + offset;
    break;

  default:
    // invalid whence is a caller bug in this internal path
    ceph_abort();
  }

  ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
8630
8631
// Acquire the per-handle file-position lock (serializes implicit-offset
// reads/writes on the same Fh).  Waiters queue FIFO: each blocks on its own
// Cond until the lock is free AND it is at the front of the queue, which
// preserves arrival order.  Waits drop/retake client_lock via Cond::Wait.
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wake-ups come from unlock_fh_pos(); recheck both conditions since
    // another waiter may be ahead of us
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8649
8650void Client::unlock_fh_pos(Fh *f)
8651{
8652 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8653 f->pos_locked = false;
8654}
8655
// Migrate MDS-held inline file data out to the first RADOS object so normal
// object I/O can proceed.  Two mutations are issued: an idempotent create of
// object <ino>.00000000, then a guarded write (cmpxattr on "inline_version",
// OP_GT) so only a newer inline_version overwrites the object.  onfinish is
// completed when the second mutation finishes (immediately, with 0, if there
// is no inline data).  Always returns 0.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // first stripe object of this inode: "<ino-hex>.00000000"
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // ensure the object exists (create non-exclusive; no-op if present)
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  // guard: only write if our inline_version is newer than the object's
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8700
8701//
8702
8703// blocking osd interface
8704
8705int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8706{
8707 Mutex::Locker lock(client_lock);
8708 tout(cct) << "read" << std::endl;
8709 tout(cct) << fd << std::endl;
8710 tout(cct) << size << std::endl;
8711 tout(cct) << offset << std::endl;
8712
181888fb
FG
8713 if (unmounting)
8714 return -ENOTCONN;
8715
7c673cae
FG
8716 Fh *f = get_filehandle(fd);
8717 if (!f)
8718 return -EBADF;
8719#if defined(__linux__) && defined(O_PATH)
8720 if (f->flags & O_PATH)
8721 return -EBADF;
8722#endif
8723 bufferlist bl;
8724 int r = _read(f, offset, size, &bl);
8725 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8726 if (r >= 0) {
8727 bl.copy(0, bl.length(), buf);
8728 r = bl.length();
8729 }
8730 return r;
8731}
8732
8733int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8734{
8735 if (iovcnt < 0)
8736 return -EINVAL;
8737 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8738}
8739
// Core read path.  offset < 0 means "use and advance the handle's implicit
// position" (guarded by the fh pos lock).  Depending on caps and config the
// read is served from inline data, the object cacher (async), or synchronous
// OSD reads; inline data may first be pushed out via uninline_data().
// Returns bytes placed in *bl, or a negative errno.
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  bool movepos = false;
  if (offset < 0) {
    // implicit-offset read: serialize on the handle position
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // inline state unknown; fetch it before deciding how to read
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
        unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;  // O_DIRECT bypasses the cache

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // can't serve inline data without cache cap: push it to RADOS and
      // fall through to a normal read; completion is awaited below
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve directly from the inline blob, zero-filling any gap up to EOF
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
      } else if ((uint64_t)offset < endoff) {
	bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // cached path; O_RSYNC forces dirty data in range to disk first
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read near EOF: drop caps, re-verify size, and retry the
      // remainder if the file turned out to be longer than we thought
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait for the uninline mutation started above; client_lock must be
    // dropped while blocking on the side mutex/cond
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else already uninlined it; both count as done
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
8883
// Readahead completion context: pins the file handle and records a pending
// readahead for the lifetime of the in-flight request.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
8889
// Undo the constructor: clear the pending-readahead count and drop the
// handle reference.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
8894
// Called when the readahead I/O completes: release the cap refs taken when
// the readahead was issued in _read_async().
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8899
// Read through the object cacher.  Blocks (dropping client_lock) only when
// the data is not already cached; afterwards it may kick off an asynchronous
// readahead whose completion releases its own cap refs via C_Readahead.
// Returns bytes read (trimmed at EOF) or a negative errno.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    // miss: wait for the cacher to fill; hold a CACHE cap ref meanwhile
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	// issued: cap refs released by C_Readahead::finish on completion
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
8964
// Synchronous (cache-bypassing) read straight from the OSDs via the Filer.
// Loops until len bytes are gathered or a short read is hit.  Holes below
// the known EOF are zero-filled; a short read at/above the known EOF sets
// *checkeof so the caller can re-verify the size and retry.  Returns bytes
// read or a negative errno.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    // drop client_lock while blocking on the OSD round-trip
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // hit (or passed) the locally-known EOF: let the caller re-check size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9031
9032
9033/*
9034 * we keep count of uncommitted sync writes on the inode, so that
9035 * fsync can DDRT.
9036 */
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync can DDRT.
 */
// Called when a synchronous write is durable: drop the outstanding-write
// count and the BUFFER cap ref, and wake the unmount path if it was waiting
// for the last unsafe write to drain.
void Client::_sync_write_commit(Inode *in)
{
  assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9050
9051int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9052{
9053 Mutex::Locker lock(client_lock);
9054 tout(cct) << "write" << std::endl;
9055 tout(cct) << fd << std::endl;
9056 tout(cct) << size << std::endl;
9057 tout(cct) << offset << std::endl;
9058
181888fb
FG
9059 if (unmounting)
9060 return -ENOTCONN;
9061
7c673cae
FG
9062 Fh *fh = get_filehandle(fd);
9063 if (!fh)
9064 return -EBADF;
9065#if defined(__linux__) && defined(O_PATH)
9066 if (fh->flags & O_PATH)
9067 return -EBADF;
9068#endif
9069 int r = _write(fh, offset, size, buf, NULL, 0);
9070 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9071 return r;
9072}
9073
9074int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9075{
9076 if (iovcnt < 0)
9077 return -EINVAL;
9078 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9079}
9080
// Shared implementation for preadv/pwritev.  For writes the iovec is handed
// to _write() directly; for reads the data is gathered into a bufferlist and
// then scattered across the iovec (a short read fills a prefix of the
// vectors and still returns the byte count).
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  // total transfer length across all vectors
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // scatter the gathered bytes back into the caller's iovec
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9130
// Core write path.  Data comes either from buf or from iov/iovcnt (exactly
// one is used).  offset < 0 means "use and advance the implicit file
// position" (honoring O_APPEND).  Depending on caps/config the write goes
// into inline data, through the object cacher (async), or synchronously to
// the OSDs; inline data may first be pushed out via uninline_data().
// Returns bytes written or a negative errno.
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	          const struct iovec *iov, int iovcnt)
{
  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    f->pos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    // inline state unknown; fetch it before deciding how to write
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      // NOTE(review): this returns while still holding the CEPH_CAP_FILE_WR
      // ref taken by get_caps() above (the put_cap_ref at 'done' is never
      // reached) — looks like a cap-ref leak; confirm against upstream.
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;  // O_DIRECT bypasses buffering

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // write no longer fits inline (or no buffer cap): push inline data
      // out to RADOS; completion is awaited at 'done'
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // splice the new bytes into the inline blob in place
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    // drop client_lock while blocking on the OSD round-trip
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    mark_caps_dirty(in, CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = ceph_clock_now();
  in->change_attr++;
  mark_caps_dirty(in, CEPH_CAP_FILE_WR);

done:

  if (onuninline) {
    // wait for the uninline mutation started above
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else already uninlined it; both count as done
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9368
9369int Client::_flush(Fh *f)
9370{
9371 Inode *in = f->inode.get();
9372 int err = f->take_async_err();
9373 if (err != 0) {
9374 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9375 << cpp_strerror(err) << dendl;
9376 } else {
9377 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9378 }
9379
9380 return err;
9381}
9382
9383int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9384{
9385 struct ceph_statx stx;
9386 stx.stx_size = length;
9387 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9388}
9389
9390int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9391{
9392 Mutex::Locker lock(client_lock);
9393 tout(cct) << "ftruncate" << std::endl;
9394 tout(cct) << fd << std::endl;
9395 tout(cct) << length << std::endl;
9396
181888fb
FG
9397 if (unmounting)
9398 return -ENOTCONN;
9399
7c673cae
FG
9400 Fh *f = get_filehandle(fd);
9401 if (!f)
9402 return -EBADF;
9403#if defined(__linux__) && defined(O_PATH)
9404 if (f->flags & O_PATH)
9405 return -EBADF;
9406#endif
9407 struct stat attr;
9408 attr.st_size = length;
9409 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9410}
9411
// fsync an open fd.  syncdataonly=true skips flushing dirty metadata caps.
// In both outcomes the handle's stored async error is consumed so it is not
// reported twice on a later call.
int Client::fsync(int fd, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we shoudl be reporting?
    r = f->take_async_err();
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9447
// fsync an inode: flush dirty data (via the object cacher when enabled),
// optionally flush dirty metadata caps, and wait for any unsafe MDS ops to
// become safe.  Returns 0 on success or the first flush error.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty metadata caps to the MDS; remember the flush tid so we can
    // wait for its ack below
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // wait for the newest unsafe MDS op on this inode to be journaled
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9512
// Fh-level convenience overload: logs and forwards to the Inode* variant.
int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
9518
9519int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9520{
9521 Mutex::Locker lock(client_lock);
9522 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9523 tout(cct) << fd << std::endl;
9524
181888fb
FG
9525 if (unmounting)
9526 return -ENOTCONN;
9527
7c673cae
FG
9528 Fh *f = get_filehandle(fd);
9529 if (!f)
9530 return -EBADF;
9531 int r = _getattr(f->inode, mask, perms);
9532 if (r < 0)
9533 return r;
9534 fill_stat(f->inode, stbuf, NULL);
9535 ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9536 return r;
9537}
9538
// statx an open fd.  The MDS is consulted only when the caps we already hold
// do not cover the requested mask (flags may relax freshness via
// statx_to_mask).  On success the caller's ceph_statx is filled.
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  if (mask && !f->inode->caps_issued_mask(mask)) {
    // our cached attrs aren't authoritative for this mask; ask the MDS
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9568
9569// not written yet, but i want to link!
9570
// Change the client's working directory to relpath and report the resulting
// absolute path in new_cwd.  Returns 0 or a negative errno from path_walk.
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cwd != in)
    cwd.swap(in);  // swap avoids extra ref churn on the InodeRef
  ldout(cct, 3) << "chdir(" << relpath << ")  cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9593
b5b8bbf5 9594void Client::_getcwd(string& dir, const UserPerm& perms)
7c673cae
FG
9595{
9596 filepath path;
9597 ldout(cct, 10) << "getcwd " << *cwd << dendl;
9598
9599 Inode *in = cwd.get();
9600 while (in != root) {
9601 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
9602
9603 // A cwd or ancester is unlinked
9604 if (in->dn_set.empty()) {
9605 return;
9606 }
9607
9608 Dentry *dn = in->get_first_parent();
9609
9610
9611 if (!dn) {
9612 // look it up
9613 ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
9614 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9615 filepath path(in->ino);
9616 req->set_filepath(path);
9617 req->set_inode(in);
9618 int res = make_request(req, perms);
9619 if (res < 0)
9620 break;
9621
9622 // start over
9623 path = filepath();
9624 in = cwd.get();
9625 continue;
9626 }
9627 path.push_front_dentry(dn->name);
9628 in = dn->dir->parent_inode;
9629 }
9630 dir = "/";
9631 dir += path.get_path();
9632}
9633
b5b8bbf5
FG
9634void Client::getcwd(string& dir, const UserPerm& perms)
9635{
9636 Mutex::Locker l(client_lock);
181888fb
FG
9637 if (!unmounting)
9638 _getcwd(dir, perms);
b5b8bbf5
FG
9639}
9640
7c673cae
FG
// Fill 'stbuf' with filesystem statistics.  The 'path' argument is
// ignored; stats are either cluster-wide (from RADOS) or, when a size
// quota covers the client's root, derived from that quota.
// Returns 0 on success or a negative errno.
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;

  if (unmounting)
    return -ENOTCONN;

  ceph_statfs stats;
  C_SaferCond cond;

  // With a single data pool, ask for that pool's stats so df reflects
  // the pool's replication/usage; otherwise fall back to cluster totals.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  // Drop client_lock while blocking on the objecter reply.
  client_lock.Unlock();
  int rval = cond.wait();
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks.  We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = stats.num_objects;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9739
// Issue a file-lock operation (get/set, fcntl or flock flavour) to the MDS
// and mirror the result into the local lock-state trackers.
//
// in        - inode the lock applies to
// fh        - file handle the lock is taken through (per-fh state is updated
//             unless 'removing')
// lock_type - CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
// op        - CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
// sleep     - nonzero to block until the lock is granted (interruptible)
// fl        - POSIX flock describing the range/type; filled in for GET
// owner     - lock-owner token (high bit is forced on, see below)
// removing  - true when called from _release_filelocks teardown, in which
//             case per-fh lock state is not re-recorded
// Returns 0 on success or a negative errno.
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // GET and UNLOCK never block
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: hand the request to the interrupt callback so a
    // signal can abort the blocking lock via _interrupt_filelock
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);

    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);  // drop the extra ref taken for the interrupt cb
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting (or unlocked) lock returned by the MDS
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // mirror the granted lock into the per-inode state...
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // ...and into the per-fh state (skipped when tearing down the fh)
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9850
// Interrupt a blocking SETFILELOCK request 'req' (e.g. on signal).
// Marks the request aborted with -EINTR and, if it was already sent to
// an MDS, issues a companion *_INTR unlock so the MDS drops the pending
// lock attempt.  Returns 0 or a negative errno from the intr request.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // translate the lock flavour into its interrupt counterpart
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  // copy the original lock args, then rewrite rule/type to "unlock the
  // in-flight attempt"
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
9883
9884void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9885{
9886 if (!in->fcntl_locks && !in->flock_locks)
9887 return;
9888
9889 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9890 ::encode(nr_fcntl_locks, bl);
9891 if (nr_fcntl_locks) {
9892 ceph_lock_state_t* lock_state = in->fcntl_locks;
9893 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9894 p != lock_state->held_locks.end();
9895 ++p)
9896 ::encode(p->second, bl);
9897 }
9898
9899 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9900 ::encode(nr_flock_locks, bl);
9901 if (nr_flock_locks) {
9902 ceph_lock_state_t* lock_state = in->flock_locks;
9903 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9904 p != lock_state->held_locks.end();
9905 ++p)
9906 ::encode(p->second, bl);
9907 }
9908
9909 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
9910 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
9911}
9912
9913void Client::_release_filelocks(Fh *fh)
9914{
9915 if (!fh->fcntl_locks && !fh->flock_locks)
9916 return;
9917
9918 Inode *in = fh->inode.get();
9919 ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
9920
9921 list<pair<int, ceph_filelock> > to_release;
9922
9923 if (fh->fcntl_locks) {
9924 ceph_lock_state_t* lock_state = fh->fcntl_locks;
9925 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9926 p != lock_state->held_locks.end();
9927 ++p)
9928 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
9929 delete fh->fcntl_locks;
9930 }
9931 if (fh->flock_locks) {
9932 ceph_lock_state_t* lock_state = fh->flock_locks;
9933 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9934 p != lock_state->held_locks.end();
9935 ++p)
9936 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
9937 delete fh->flock_locks;
9938 }
9939
9940 if (to_release.empty())
9941 return;
9942
9943 struct flock fl;
9944 memset(&fl, 0, sizeof(fl));
9945 fl.l_whence = SEEK_SET;
9946 fl.l_type = F_UNLCK;
9947
9948 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
9949 p != to_release.end();
9950 ++p) {
9951 fl.l_start = p->second.start;
9952 fl.l_len = p->second.length;
9953 fl.l_pid = p->second.pid;
9954 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
9955 p->second.owner, true);
9956 }
9957}
9958
9959void Client::_update_lock_state(struct flock *fl, uint64_t owner,
9960 ceph_lock_state_t *lock_state)
9961{
9962 int lock_cmd;
9963 if (F_RDLCK == fl->l_type)
9964 lock_cmd = CEPH_LOCK_SHARED;
9965 else if (F_WRLCK == fl->l_type)
9966 lock_cmd = CEPH_LOCK_EXCL;
9967 else
9968 lock_cmd = CEPH_LOCK_UNLOCK;;
9969
9970 ceph_filelock filelock;
9971 filelock.start = fl->l_start;
9972 filelock.length = fl->l_len;
9973 filelock.client = 0;
9974 // see comment in _do_filelock()
9975 filelock.owner = owner | (1ULL << 63);
9976 filelock.pid = fl->l_pid;
9977 filelock.type = lock_cmd;
9978
9979 if (filelock.type == CEPH_LOCK_UNLOCK) {
9980 list<ceph_filelock> activated_locks;
9981 lock_state->remove_lock(filelock, activated_locks);
9982 } else {
9983 bool r = lock_state->add_lock(filelock, false, false, NULL);
9984 assert(r);
9985 }
9986}
9987
9988int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
9989{
9990 Inode *in = fh->inode.get();
9991 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
9992 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
9993 return ret;
9994}
9995
9996int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
9997{
9998 Inode *in = fh->inode.get();
9999 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10000 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10001 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10002 return ret;
10003}
10004
10005int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10006{
10007 Inode *in = fh->inode.get();
10008 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10009
10010 int sleep = !(cmd & LOCK_NB);
10011 cmd &= ~LOCK_NB;
10012
10013 int type;
10014 switch (cmd) {
10015 case LOCK_SH:
10016 type = F_RDLCK;
10017 break;
10018 case LOCK_EX:
10019 type = F_WRLCK;
10020 break;
10021 case LOCK_UN:
10022 type = F_UNLCK;
10023 break;
10024 default:
10025 return -EINVAL;
10026 }
10027
10028 struct flock fl;
10029 memset(&fl, 0, sizeof(fl));
10030 fl.l_type = type;
10031 fl.l_whence = SEEK_SET;
10032
10033 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10034 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10035 return ret;
10036}
10037
10038int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10039{
10040 /* Since the only thing this does is wrap a call to statfs, and
10041 statfs takes a lock, it doesn't seem we have a need to split it
10042 out. */
10043 return statfs(0, stbuf, perms);
10044}
10045
10046void Client::ll_register_callbacks(struct client_callback_args *args)
10047{
10048 if (!args)
10049 return;
10050 Mutex::Locker l(client_lock);
10051 ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
10052 << " invalidate_ino_cb " << args->ino_cb
10053 << " invalidate_dentry_cb " << args->dentry_cb
10054 << " getgroups_cb" << args->getgroups_cb
10055 << " switch_interrupt_cb " << args->switch_intr_cb
10056 << " remount_cb " << args->remount_cb
10057 << dendl;
10058 callback_handle = args->handle;
10059 if (args->ino_cb) {
10060 ino_invalidate_cb = args->ino_cb;
10061 async_ino_invalidator.start();
10062 }
10063 if (args->dentry_cb) {
10064 dentry_invalidate_cb = args->dentry_cb;
10065 async_dentry_invalidator.start();
10066 }
10067 if (args->switch_intr_cb) {
10068 switch_interrupt_cb = args->switch_intr_cb;
10069 interrupt_finisher.start();
10070 }
10071 if (args->remount_cb) {
10072 remount_cb = args->remount_cb;
10073 remount_finisher.start();
10074 }
10075 getgroups_cb = args->getgroups_cb;
10076 umask_cb = args->umask_cb;
10077}
10078
// Verify we have some way to invalidate kernel dentry cache entries:
// either a registered dentry-invalidate callback, or a remount callback
// as fallback.  Returns 0 if a working method exists; on failure either
// aborts (if configured) or logs and returns the remount error.
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    // a test remount proves the fallback path actually works
    r = _do_remount();
  }
  if (r) {
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort) {
      lderr(cct) << "no method to invalidate kernel dentry cache; quitting!" << dendl;
      ceph_abort();
    } else {
      lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    }
  }
  return r;
}
10104
// Flush all dirty file data and caps to the cluster and wait for both to
// be stable.  Caller must hold client_lock; it is dropped while waiting
// on the object-cacher flush.  Always returns 0.
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data through the object cacher (async; signalled via cond)
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    // drop client_lock while blocking on the data flush to avoid
    // deadlocking the flusher callbacks
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
10139
10140int Client::sync_fs()
10141{
10142 Mutex::Locker l(client_lock);
181888fb
FG
10143
10144 if (unmounting)
10145 return -ENOTCONN;
10146
7c673cae
FG
10147 return _sync_fs();
10148}
10149
10150int64_t Client::drop_caches()
10151{
10152 Mutex::Locker l(client_lock);
10153 return objectcacher->release_all();
10154}
10155
10156
10157int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10158{
10159 Mutex::Locker l(client_lock);
10160 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10161 << ", " << offset << ", " << count << ")" << dendl;
10162
10163 Fh *f = get_filehandle(fd);
10164 if (!f)
10165 return -EBADF;
10166
10167 // for now
10168 _fsync(f, true);
10169
10170 return 0;
10171}
10172
10173int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10174{
10175 Mutex::Locker l(client_lock);
10176 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10177 << ", " << offset << ", " << count << ")" << dendl;
10178
10179 Fh *f = get_filehandle(fd);
10180 if (!f)
10181 return -EBADF;
10182 Inode *in = f->inode.get();
10183
10184 _fsync(f, true);
10185 if (_release(in))
10186 check_caps(in, 0);
10187 return 0;
10188}
10189
10190
10191// =============================
10192// snaps
10193
10194int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10195{
10196 Mutex::Locker l(client_lock);
181888fb
FG
10197
10198 if (unmounting)
10199 return -ENOTCONN;
10200
7c673cae
FG
10201 filepath path(relpath);
10202 InodeRef in;
10203 int r = path_walk(path, &in, perm);
10204 if (r < 0)
10205 return r;
10206 if (cct->_conf->client_permissions) {
10207 r = may_create(in.get(), perm);
10208 if (r < 0)
10209 return r;
10210 }
10211 Inode *snapdir = open_snapdir(in.get());
10212 return _mkdir(snapdir, name, 0, perm);
10213}
181888fb 10214
7c673cae
FG
// Remove snapshot 'name' of the directory at 'relpath'; implemented as
// an rmdir under the directory's virtual .snap inode.
// Returns 0 or a negative errno.
int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_delete(in.get(), NULL, perms);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _rmdir(snapdir, name, perms);
}
10235
10236// =============================
10237// expose caps
10238
10239int Client::get_caps_issued(int fd) {
10240
10241 Mutex::Locker lock(client_lock);
10242
181888fb
FG
10243 if (unmounting)
10244 return -ENOTCONN;
10245
7c673cae
FG
10246 Fh *f = get_filehandle(fd);
10247 if (!f)
10248 return -EBADF;
10249
10250 return f->inode->caps_issued();
10251}
10252
10253int Client::get_caps_issued(const char *path, const UserPerm& perms)
10254{
10255 Mutex::Locker lock(client_lock);
181888fb
FG
10256
10257 if (unmounting)
10258 return -ENOTCONN;
10259
7c673cae
FG
10260 filepath p(path);
10261 InodeRef in;
10262 int r = path_walk(p, &in, perms, true);
10263 if (r < 0)
10264 return r;
10265 return in->caps_issued();
10266}
10267
10268// =========================================
10269// low level
10270
// Return (creating and caching on first use) the virtual ".snap"
// directory inode for 'diri'.  The snapdir shares the directory's ino
// but uses snapid CEPH_SNAPDIR, and mirrors the parent's attributes.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // mirror attributes from the real directory
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;  // remember the parent has a live snapdir
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10302
// Low-level lookup of 'name' under 'parent'.  On success fills *attr,
// takes an ll ref on the found inode and returns it via *out.
// On failure attr->st_ino is zeroed.  Returns 0 or a negative errno.
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // permission checks are skipped when FUSE does them itself
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // pin for the caller; released via ll_forget/ll_put

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
		<< " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  // on error 'in' is empty, so *out is set to NULL here
  *out = in.get();
  return r;
}
10342
// statx flavour of ll_lookup: looks up 'name' under 'parent', fills *stx
// with the fields selected by want/flags, takes an ll ref on the inode
// and returns it via *out.  On failure stx ino/mask are zeroed.
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // permission checks are skipped when FUSE does them itself
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller; released via ll_forget/ll_put
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
		<< " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10383
10384int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10385 unsigned int want, unsigned int flags, const UserPerm& perms)
10386{
10387 Mutex::Locker lock(client_lock);
181888fb
FG
10388
10389 if (unmounting)
10390 return -ENOTCONN;
10391
7c673cae
FG
10392 filepath fp(name, 0);
10393 InodeRef in;
10394 int rc;
10395 unsigned mask = statx_to_mask(flags, want);
10396
10397 ldout(cct, 3) << "ll_walk" << name << dendl;
10398 tout(cct) << "ll_walk" << std::endl;
10399 tout(cct) << name << std::endl;
10400
10401 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10402 if (rc < 0) {
10403 /* zero out mask, just in case... */
10404 stx->stx_mask = 0;
10405 stx->stx_ino = 0;
10406 *out = NULL;
10407 return rc;
10408 } else {
10409 assert(in);
10410 fill_statx(in, mask, stx);
10411 _ll_get(in.get());
10412 *out = in.get();
10413 return 0;
10414 }
10415}
10416
// Take one low-level (FUSE-visible) reference on 'in'.  The first ll ref
// also takes an internal inode ref and, for directories, pins the parent
// dentry so the path to the inode stays cached.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10429
// Drop 'num' low-level references from 'in'.  When the ll refcount hits
// zero, undo what _ll_get did: unpin the parent dentry (for dirs) and
// drop the internal inode ref.  Returns the remaining ll refcount.
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);  // may free 'in' and remove it from inode_map
    return 0;
  } else {
    return in->ll_ref;
  }
}
10445
10446void Client::_ll_drop_pins()
10447{
10448 ldout(cct, 10) << "_ll_drop_pins" << dendl;
10449 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10450 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10451 it != inode_map.end();
10452 it = next) {
10453 Inode *in = it->second;
10454 next = it;
10455 ++next;
10456 if (in->ll_ref)
10457 _ll_put(in, in->ll_ref);
10458 }
10459}
10460
// FUSE "forget": drop 'count' low-level references on 'in'.
// Returns true when the last ll ref was dropped (or when the forget is
// ignored: unmounting, or the root inode).
bool Client::ll_forget(Inode *in, int count)
{
  Mutex::Locker lock(client_lock);
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // over-forget: clamp to the refs we actually hold and warn
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10490
10491bool Client::ll_put(Inode *in)
10492{
10493 /* ll_forget already takes the lock */
10494 return ll_forget(in, 1);
10495}
10496
10497snapid_t Client::ll_get_snapid(Inode *in)
10498{
10499 Mutex::Locker lock(client_lock);
10500 return in->snapid;
10501}
10502
10503Inode *Client::ll_get_inode(ino_t ino)
10504{
10505 Mutex::Locker lock(client_lock);
181888fb
FG
10506
10507 if (unmounting)
10508 return NULL;
10509
7c673cae
FG
10510 vinodeno_t vino = _map_faked_ino(ino);
10511 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10512 if (p == inode_map.end())
10513 return NULL;
10514 Inode *in = p->second;
10515 _ll_get(in);
10516 return in;
10517}
10518
10519Inode *Client::ll_get_inode(vinodeno_t vino)
10520{
10521 Mutex::Locker lock(client_lock);
181888fb
FG
10522
10523 if (unmounting)
10524 return NULL;
10525
7c673cae
FG
10526 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10527 if (p == inode_map.end())
10528 return NULL;
10529 Inode *in = p->second;
10530 _ll_get(in);
10531 return in;
10532}
10533
10534int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10535{
10536 vinodeno_t vino = _get_vino(in);
10537
10538 ldout(cct, 3) << "ll_getattr " << vino << dendl;
10539 tout(cct) << "ll_getattr" << std::endl;
10540 tout(cct) << vino.ino.val << std::endl;
10541
10542 if (vino.snapid < CEPH_NOSNAP)
10543 return 0;
10544 else
10545 return _getattr(in, caps, perms);
10546}
10547
// Low-level getattr: refresh all inode caps and fill *attr.
// Returns 0 or a negative errno (-ENOTCONN if unmounting).
int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);

  if (res == 0)
    fill_stat(in, attr);
  ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
  return res;
}
10562
// Low-level statx-style getattr: refresh only the caps implied by
// want/flags (skipping the MDS round trip entirely when they are already
// issued) and fill *stx.  Returns 0 or a negative errno.
int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
			unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  int res = 0;
  unsigned mask = statx_to_mask(flags, want);

  // only hit the MDS when the needed caps are not already issued
  if (mask && !in->caps_issued_mask(mask))
    res = _ll_getattr(in, mask, perms);

  if (res == 0)
    fill_statx(in, mask, stx);
  ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
  return res;
}
10582
// Shared implementation for ll_setattr/ll_setattrx: permission-check
// (unless FUSE does it) then apply the attribute changes selected by
// 'mask'.  On success *inp references the (possibly replaced) inode.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << "ll_setattrx" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // *_NOW bits were only needed for the permission check above; the
  // actual "now" expansion happens inside __setattrx
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
10611
// Low-level statx-style setattr.  On success *stx is refilled with the
// inode's resulting attributes.  Returns 0 or a negative errno.
int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef target(in);
  int res = _ll_setattrx(in, stx, mask, perms, &target);
  if (res == 0) {
    // setattr on a low-level handle must not swap the inode out
    assert(in == target.get());
    fill_statx(in, in->caps_issued(), stx);
  }

  ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
  return res;
}
10630
// Low-level setattr taking a classic struct stat; converts to ceph_statx
// and delegates to _ll_setattrx.  On success *attr is refilled with the
// resulting attributes.  Returns 0 or a negative errno.
int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
		       const UserPerm& perms)
{
  struct ceph_statx stx;
  stat_to_statx(attr, &stx);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef target(in);
  int res = _ll_setattrx(in, &stx, mask, perms, &target);
  if (res == 0) {
    // setattr on a low-level handle must not swap the inode out
    assert(in == target.get());
    fill_stat(in, attr);
  }

  ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
  return res;
}
10652
10653
10654// ----------
10655// xattrs
10656
10657int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10658 const UserPerm& perms)
10659{
10660 Mutex::Locker lock(client_lock);
181888fb
FG
10661
10662 if (unmounting)
10663 return -ENOTCONN;
10664
7c673cae
FG
10665 InodeRef in;
10666 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10667 if (r < 0)
10668 return r;
10669 return _getxattr(in, name, value, size, perms);
10670}
10671
10672int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10673 const UserPerm& perms)
10674{
10675 Mutex::Locker lock(client_lock);
181888fb
FG
10676
10677 if (unmounting)
10678 return -ENOTCONN;
10679
7c673cae
FG
10680 InodeRef in;
10681 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10682 if (r < 0)
10683 return r;
10684 return _getxattr(in, name, value, size, perms);
10685}
10686
10687int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10688 const UserPerm& perms)
10689{
10690 Mutex::Locker lock(client_lock);
181888fb
FG
10691
10692 if (unmounting)
10693 return -ENOTCONN;
10694
7c673cae
FG
10695 Fh *f = get_filehandle(fd);
10696 if (!f)
10697 return -EBADF;
10698 return _getxattr(f->inode, name, value, size, perms);
10699}
10700
10701int Client::listxattr(const char *path, char *list, size_t size,
10702 const UserPerm& perms)
10703{
10704 Mutex::Locker lock(client_lock);
181888fb
FG
10705
10706 if (unmounting)
10707 return -ENOTCONN;
10708
7c673cae
FG
10709 InodeRef in;
10710 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10711 if (r < 0)
10712 return r;
10713 return Client::_listxattr(in.get(), list, size, perms);
10714}
10715
10716int Client::llistxattr(const char *path, char *list, size_t size,
10717 const UserPerm& perms)
10718{
10719 Mutex::Locker lock(client_lock);
181888fb
FG
10720
10721 if (unmounting)
10722 return -ENOTCONN;
10723
7c673cae
FG
10724 InodeRef in;
10725 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10726 if (r < 0)
10727 return r;
10728 return Client::_listxattr(in.get(), list, size, perms);
10729}
10730
10731int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10732{
10733 Mutex::Locker lock(client_lock);
181888fb
FG
10734
10735 if (unmounting)
10736 return -ENOTCONN;
10737
7c673cae
FG
10738 Fh *f = get_filehandle(fd);
10739 if (!f)
10740 return -EBADF;
10741 return Client::_listxattr(f->inode.get(), list, size, perms);
10742}
10743
10744int Client::removexattr(const char *path, const char *name,
10745 const UserPerm& perms)
10746{
10747 Mutex::Locker lock(client_lock);
181888fb
FG
10748
10749 if (unmounting)
10750 return -ENOTCONN;
10751
7c673cae
FG
10752 InodeRef in;
10753 int r = Client::path_walk(path, &in, perms, true);
10754 if (r < 0)
10755 return r;
10756 return _removexattr(in, name, perms);
10757}
10758
10759int Client::lremovexattr(const char *path, const char *name,
10760 const UserPerm& perms)
10761{
10762 Mutex::Locker lock(client_lock);
181888fb
FG
10763
10764 if (unmounting)
10765 return -ENOTCONN;
10766
7c673cae
FG
10767 InodeRef in;
10768 int r = Client::path_walk(path, &in, perms, false);
10769 if (r < 0)
10770 return r;
10771 return _removexattr(in, name, perms);
10772}
10773
10774int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10775{
10776 Mutex::Locker lock(client_lock);
181888fb
FG
10777
10778 if (unmounting)
10779 return -ENOTCONN;
10780
7c673cae
FG
10781 Fh *f = get_filehandle(fd);
10782 if (!f)
10783 return -EBADF;
10784 return _removexattr(f->inode, name, perms);
10785}
10786
10787int Client::setxattr(const char *path, const char *name, const void *value,
10788 size_t size, int flags, const UserPerm& perms)
10789{
10790 _setxattr_maybe_wait_for_osdmap(name, value, size);
10791
10792 Mutex::Locker lock(client_lock);
181888fb
FG
10793
10794 if (unmounting)
10795 return -ENOTCONN;
10796
7c673cae
FG
10797 InodeRef in;
10798 int r = Client::path_walk(path, &in, perms, true);
10799 if (r < 0)
10800 return r;
10801 return _setxattr(in, name, value, size, flags, perms);
10802}
10803
10804int Client::lsetxattr(const char *path, const char *name, const void *value,
10805 size_t size, int flags, const UserPerm& perms)
10806{
10807 _setxattr_maybe_wait_for_osdmap(name, value, size);
10808
10809 Mutex::Locker lock(client_lock);
181888fb
FG
10810
10811 if (unmounting)
10812 return -ENOTCONN;
10813
7c673cae
FG
10814 InodeRef in;
10815 int r = Client::path_walk(path, &in, perms, false);
10816 if (r < 0)
10817 return r;
10818 return _setxattr(in, name, value, size, flags, perms);
10819}
10820
10821int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10822 int flags, const UserPerm& perms)
10823{
10824 _setxattr_maybe_wait_for_osdmap(name, value, size);
10825
10826 Mutex::Locker lock(client_lock);
181888fb
FG
10827
10828 if (unmounting)
10829 return -ENOTCONN;
10830
7c673cae
FG
10831 Fh *f = get_filehandle(fd);
10832 if (!f)
10833 return -EBADF;
10834 return _setxattr(f->inode, name, value, size, flags, perms);
10835}
10836
10837int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10838 const UserPerm& perms)
10839{
10840 int r;
10841
10842 const VXattr *vxattr = _match_vxattr(in, name);
10843 if (vxattr) {
10844 r = -ENODATA;
10845
10846 // Do a force getattr to get the latest quota before returning
10847 // a value to userspace.
10848 r = _getattr(in, 0, perms, true);
10849 if (r != 0) {
10850 // Error from getattr!
10851 return r;
10852 }
10853
10854 // call pointer-to-member function
10855 char buf[256];
10856 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10857 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10858 } else {
10859 r = -ENODATA;
10860 }
10861
10862 if (size != 0) {
10863 if (r > (int)size) {
10864 r = -ERANGE;
10865 } else if (r > 0) {
10866 memcpy(value, buf, r);
10867 }
10868 }
10869 goto out;
10870 }
10871
10872 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10873 r = -EOPNOTSUPP;
10874 goto out;
10875 }
10876
10877 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10878 if (r == 0) {
10879 string n(name);
10880 r = -ENODATA;
10881 if (in->xattrs.count(n)) {
10882 r = in->xattrs[n].length();
10883 if (r > 0 && size != 0) {
10884 if (size >= (unsigned)r)
10885 memcpy(value, in->xattrs[n].c_str(), r);
10886 else
10887 r = -ERANGE;
10888 }
10889 }
10890 }
10891 out:
10892 ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
10893 return r;
10894}
10895
10896int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
10897 const UserPerm& perms)
10898{
10899 if (cct->_conf->client_permissions) {
10900 int r = xattr_permission(in.get(), name, MAY_READ, perms);
10901 if (r < 0)
10902 return r;
10903 }
10904 return _getxattr(in.get(), name, value, size, perms);
10905}
10906
10907int Client::ll_getxattr(Inode *in, const char *name, void *value,
10908 size_t size, const UserPerm& perms)
10909{
10910 Mutex::Locker lock(client_lock);
10911
181888fb
FG
10912 if (unmounting)
10913 return -ENOTCONN;
10914
7c673cae
FG
10915 vinodeno_t vino = _get_vino(in);
10916
10917 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
10918 tout(cct) << "ll_getxattr" << std::endl;
10919 tout(cct) << vino.ino.val << std::endl;
10920 tout(cct) << name << std::endl;
10921
10922 if (!cct->_conf->fuse_default_permissions) {
10923 int r = xattr_permission(in, name, MAY_READ, perms);
10924 if (r < 0)
10925 return r;
10926 }
10927
10928 return _getxattr(in, name, value, size, perms);
10929}
10930
10931int Client::_listxattr(Inode *in, char *name, size_t size,
10932 const UserPerm& perms)
10933{
10934 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10935 if (r == 0) {
10936 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
10937 p != in->xattrs.end();
10938 ++p)
10939 r += p->first.length() + 1;
10940
10941 const VXattr *vxattrs = _get_vxattrs(in);
10942 r += _vxattrs_name_size(vxattrs);
10943
10944 if (size != 0) {
10945 if (size >= (unsigned)r) {
10946 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
10947 p != in->xattrs.end();
10948 ++p) {
10949 memcpy(name, p->first.c_str(), p->first.length());
10950 name += p->first.length();
10951 *name = '\0';
10952 name++;
10953 }
10954 if (vxattrs) {
10955 for (int i = 0; !vxattrs[i].name.empty(); i++) {
10956 const VXattr& vxattr = vxattrs[i];
10957 if (vxattr.hidden)
10958 continue;
10959 // call pointer-to-member function
10960 if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
10961 continue;
10962 memcpy(name, vxattr.name.c_str(), vxattr.name.length());
10963 name += vxattr.name.length();
10964 *name = '\0';
10965 name++;
10966 }
10967 }
10968 } else
10969 r = -ERANGE;
10970 }
10971 }
10972 ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
10973 return r;
10974}
10975
10976int Client::ll_listxattr(Inode *in, char *names, size_t size,
10977 const UserPerm& perms)
10978{
10979 Mutex::Locker lock(client_lock);
10980
181888fb
FG
10981 if (unmounting)
10982 return -ENOTCONN;
10983
7c673cae
FG
10984 vinodeno_t vino = _get_vino(in);
10985
10986 ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
10987 tout(cct) << "ll_listxattr" << std::endl;
10988 tout(cct) << vino.ino.val << std::endl;
10989 tout(cct) << size << std::endl;
10990
10991 return _listxattr(in, names, size, perms);
10992}
10993
10994int Client::_do_setxattr(Inode *in, const char *name, const void *value,
10995 size_t size, int flags, const UserPerm& perms)
10996{
10997
10998 int xattr_flags = 0;
10999 if (!value)
11000 xattr_flags |= CEPH_XATTR_REMOVE;
11001 if (flags & XATTR_CREATE)
11002 xattr_flags |= CEPH_XATTR_CREATE;
11003 if (flags & XATTR_REPLACE)
11004 xattr_flags |= CEPH_XATTR_REPLACE;
11005
11006 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11007 filepath path;
11008 in->make_nosnap_relative_path(path);
11009 req->set_filepath(path);
11010 req->set_string2(name);
11011 req->set_inode(in);
11012 req->head.args.setxattr.flags = xattr_flags;
11013
11014 bufferlist bl;
11015 bl.append((const char*)value, size);
11016 req->set_data(bl);
11017
11018 int res = make_request(req, perms);
11019
11020 trim_cache();
11021 ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
11022 res << dendl;
11023 return res;
11024}
11025
11026int Client::_setxattr(Inode *in, const char *name, const void *value,
11027 size_t size, int flags, const UserPerm& perms)
11028{
11029 if (in->snapid != CEPH_NOSNAP) {
11030 return -EROFS;
11031 }
11032
11033 bool posix_acl_xattr = false;
11034 if (acl_type == POSIX_ACL)
11035 posix_acl_xattr = !strncmp(name, "system.", 7);
11036
11037 if (strncmp(name, "user.", 5) &&
11038 strncmp(name, "security.", 9) &&
11039 strncmp(name, "trusted.", 8) &&
11040 strncmp(name, "ceph.", 5) &&
11041 !posix_acl_xattr)
11042 return -EOPNOTSUPP;
11043
11044 if (posix_acl_xattr) {
11045 if (!strcmp(name, ACL_EA_ACCESS)) {
11046 mode_t new_mode = in->mode;
11047 if (value) {
11048 int ret = posix_acl_equiv_mode(value, size, &new_mode);
11049 if (ret < 0)
11050 return ret;
11051 if (ret == 0) {
11052 value = NULL;
11053 size = 0;
11054 }
11055 if (new_mode != in->mode) {
11056 struct ceph_statx stx;
11057 stx.stx_mode = new_mode;
11058 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
11059 if (ret < 0)
11060 return ret;
11061 }
11062 }
11063 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
11064 if (value) {
11065 if (!S_ISDIR(in->mode))
11066 return -EACCES;
11067 int ret = posix_acl_check(value, size);
11068 if (ret < 0)
11069 return -EINVAL;
11070 if (ret == 0) {
11071 value = NULL;
11072 size = 0;
11073 }
11074 }
11075 } else {
11076 return -EOPNOTSUPP;
11077 }
11078 } else {
11079 const VXattr *vxattr = _match_vxattr(in, name);
11080 if (vxattr && vxattr->readonly)
11081 return -EOPNOTSUPP;
11082 }
11083
11084 return _do_setxattr(in, name, value, size, flags, perms);
11085}
11086
11087int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11088 size_t size, int flags, const UserPerm& perms)
11089{
11090 if (cct->_conf->client_permissions) {
11091 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11092 if (r < 0)
11093 return r;
11094 }
11095 return _setxattr(in.get(), name, value, size, flags, perms);
11096}
11097
11098int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11099{
11100 string tmp;
11101 if (name == "layout") {
11102 string::iterator begin = value.begin();
11103 string::iterator end = value.end();
11104 keys_and_values<string::iterator> p; // create instance of parser
11105 std::map<string, string> m; // map to receive results
11106 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11107 return -EINVAL;
11108 }
11109 if (begin != end)
11110 return -EINVAL;
11111 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11112 if (q->first == "pool") {
11113 tmp = q->second;
11114 break;
11115 }
11116 }
11117 } else if (name == "layout.pool") {
11118 tmp = value;
11119 }
11120
11121 if (tmp.length()) {
11122 int64_t pool;
11123 try {
11124 pool = boost::lexical_cast<unsigned>(tmp);
11125 if (!osdmap->have_pg_pool(pool))
11126 return -ENOENT;
11127 } catch (boost::bad_lexical_cast const&) {
11128 pool = osdmap->lookup_pg_pool_name(tmp);
11129 if (pool < 0) {
11130 return -ENOENT;
11131 }
11132 }
11133 }
11134
11135 return 0;
11136}
11137
11138void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11139{
11140 // For setting pool of layout, MetaRequest need osdmap epoch.
11141 // There is a race which create a new data pool but client and mds both don't have.
11142 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11143 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11144 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11145 string rest(strstr(name, "layout"));
11146 string v((const char*)value, size);
11147 int r = objecter->with_osdmap([&](const OSDMap& o) {
11148 return _setxattr_check_data_pool(rest, v, &o);
11149 });
11150
11151 if (r == -ENOENT) {
11152 C_SaferCond ctx;
11153 objecter->wait_for_latest_osdmap(&ctx);
11154 ctx.wait();
11155 }
11156 }
11157}
11158
11159int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11160 size_t size, int flags, const UserPerm& perms)
11161{
11162 _setxattr_maybe_wait_for_osdmap(name, value, size);
11163
11164 Mutex::Locker lock(client_lock);
11165
181888fb
FG
11166 if (unmounting)
11167 return -ENOTCONN;
11168
7c673cae
FG
11169 vinodeno_t vino = _get_vino(in);
11170
11171 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
11172 tout(cct) << "ll_setxattr" << std::endl;
11173 tout(cct) << vino.ino.val << std::endl;
11174 tout(cct) << name << std::endl;
11175
11176 if (!cct->_conf->fuse_default_permissions) {
11177 int r = xattr_permission(in, name, MAY_WRITE, perms);
11178 if (r < 0)
11179 return r;
11180 }
11181 return _setxattr(in, name, value, size, flags, perms);
11182}
11183
11184int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11185{
11186 if (in->snapid != CEPH_NOSNAP) {
11187 return -EROFS;
11188 }
11189
11190 // same xattrs supported by kernel client
11191 if (strncmp(name, "user.", 5) &&
11192 strncmp(name, "system.", 7) &&
11193 strncmp(name, "security.", 9) &&
11194 strncmp(name, "trusted.", 8) &&
11195 strncmp(name, "ceph.", 5))
11196 return -EOPNOTSUPP;
11197
11198 const VXattr *vxattr = _match_vxattr(in, name);
11199 if (vxattr && vxattr->readonly)
11200 return -EOPNOTSUPP;
11201
11202 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11203 filepath path;
11204 in->make_nosnap_relative_path(path);
11205 req->set_filepath(path);
11206 req->set_filepath2(name);
11207 req->set_inode(in);
11208
11209 int res = make_request(req, perms);
11210
11211 trim_cache();
11212 ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11213 return res;
11214}
11215
11216int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11217{
11218 if (cct->_conf->client_permissions) {
11219 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11220 if (r < 0)
11221 return r;
11222 }
11223 return _removexattr(in.get(), name, perms);
11224}
11225
11226int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11227{
11228 Mutex::Locker lock(client_lock);
11229
181888fb
FG
11230 if (unmounting)
11231 return -ENOTCONN;
11232
7c673cae
FG
11233 vinodeno_t vino = _get_vino(in);
11234
11235 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11236 tout(cct) << "ll_removexattr" << std::endl;
11237 tout(cct) << vino.ino.val << std::endl;
11238 tout(cct) << name << std::endl;
11239
11240 if (!cct->_conf->fuse_default_permissions) {
11241 int r = xattr_permission(in, name, MAY_WRITE, perms);
11242 if (r < 0)
11243 return r;
11244 }
11245
11246 return _removexattr(in, name, perms);
11247}
11248
11249bool Client::_vxattrcb_quota_exists(Inode *in)
11250{
11251 return in->quota.is_enable();
11252}
11253size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11254{
11255 return snprintf(val, size,
11256 "max_bytes=%lld max_files=%lld",
11257 (long long int)in->quota.max_bytes,
11258 (long long int)in->quota.max_files);
11259}
11260size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11261{
11262 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11263}
11264size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11265{
11266 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11267}
11268
11269bool Client::_vxattrcb_layout_exists(Inode *in)
11270{
11271 return in->layout != file_layout_t();
11272}
11273size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11274{
11275 int r = snprintf(val, size,
11276 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
11277 (unsigned long long)in->layout.stripe_unit,
11278 (unsigned long long)in->layout.stripe_count,
11279 (unsigned long long)in->layout.object_size);
11280 objecter->with_osdmap([&](const OSDMap& o) {
11281 if (o.have_pg_pool(in->layout.pool_id))
11282 r += snprintf(val + r, size - r, "%s",
11283 o.get_pool_name(in->layout.pool_id).c_str());
11284 else
11285 r += snprintf(val + r, size - r, "%" PRIu64,
11286 (uint64_t)in->layout.pool_id);
11287 });
11288 if (in->layout.pool_ns.length())
11289 r += snprintf(val + r, size - r, " pool_namespace=%s",
11290 in->layout.pool_ns.c_str());
11291 return r;
11292}
11293size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11294{
11295 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
11296}
11297size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11298{
11299 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
11300}
11301size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11302{
11303 return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
11304}
11305size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11306{
11307 size_t r;
11308 objecter->with_osdmap([&](const OSDMap& o) {
11309 if (o.have_pg_pool(in->layout.pool_id))
11310 r = snprintf(val, size, "%s", o.get_pool_name(
11311 in->layout.pool_id).c_str());
11312 else
11313 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11314 });
11315 return r;
11316}
11317size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11318{
11319 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11320}
11321size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11322{
11323 return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11324}
11325size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11326{
11327 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
11328}
11329size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11330{
11331 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
11332}
11333size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11334{
11335 return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11336}
11337size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11338{
11339 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
11340}
11341size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11342{
11343 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
11344}
11345size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11346{
11347 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
11348}
11349size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11350{
11351 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11352 (long)in->rstat.rctime.nsec());
11353}
11354
// Helpers for building the virtual-xattr tables below.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only, user-visible "ceph.<type>.<name>" entry.
#define XATTR_NAME_CEPH(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  hidden: false,						\
  exists_cb: NULL,						\
}
// Writable, hidden layout sub-field; exists only when a layout is set.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		\
{								\
  name: CEPH_XATTR_NAME2(_type, _name, _field),			\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	\
  readonly: false,						\
  hidden: true,							\
  exists_cb: &Client::_vxattrcb_layout_exists,			\
}
// Writable, hidden quota sub-field; exists only when a quota is set.
#define XATTR_QUOTA_FIELD(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: false,						\
  hidden: true,							\
  exists_cb: &Client::_vxattrcb_quota_exists,			\
}
11382
11383const Client::VXattr Client::_dir_vxattrs[] = {
11384 {
11385 name: "ceph.dir.layout",
11386 getxattr_cb: &Client::_vxattrcb_layout,
11387 readonly: false,
11388 hidden: true,
11389 exists_cb: &Client::_vxattrcb_layout_exists,
11390 },
11391 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11392 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11393 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11394 XATTR_LAYOUT_FIELD(dir, layout, pool),
11395 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11396 XATTR_NAME_CEPH(dir, entries),
11397 XATTR_NAME_CEPH(dir, files),
11398 XATTR_NAME_CEPH(dir, subdirs),
11399 XATTR_NAME_CEPH(dir, rentries),
11400 XATTR_NAME_CEPH(dir, rfiles),
11401 XATTR_NAME_CEPH(dir, rsubdirs),
11402 XATTR_NAME_CEPH(dir, rbytes),
11403 XATTR_NAME_CEPH(dir, rctime),
11404 {
11405 name: "ceph.quota",
11406 getxattr_cb: &Client::_vxattrcb_quota,
11407 readonly: false,
11408 hidden: true,
11409 exists_cb: &Client::_vxattrcb_quota_exists,
11410 },
11411 XATTR_QUOTA_FIELD(quota, max_bytes),
11412 XATTR_QUOTA_FIELD(quota, max_files),
11413 { name: "" } /* Required table terminator */
11414};
11415
11416const Client::VXattr Client::_file_vxattrs[] = {
11417 {
11418 name: "ceph.file.layout",
11419 getxattr_cb: &Client::_vxattrcb_layout,
11420 readonly: false,
11421 hidden: true,
11422 exists_cb: &Client::_vxattrcb_layout_exists,
11423 },
11424 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
11425 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
11426 XATTR_LAYOUT_FIELD(file, layout, object_size),
11427 XATTR_LAYOUT_FIELD(file, layout, pool),
11428 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
11429 { name: "" } /* Required table terminator */
11430};
11431
11432const Client::VXattr *Client::_get_vxattrs(Inode *in)
11433{
11434 if (in->is_dir())
11435 return _dir_vxattrs;
11436 else if (in->is_file())
11437 return _file_vxattrs;
11438 return NULL;
11439}
11440
11441const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11442{
11443 if (strncmp(name, "ceph.", 5) == 0) {
11444 const VXattr *vxattr = _get_vxattrs(in);
11445 if (vxattr) {
11446 while (!vxattr->name.empty()) {
11447 if (vxattr->name == name)
11448 return vxattr;
11449 vxattr++;
11450 }
11451 }
11452 }
11453 return NULL;
11454}
11455
11456size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11457{
11458 size_t len = 0;
11459 while (!vxattr->name.empty()) {
11460 if (!vxattr->hidden)
11461 len += vxattr->name.length() + 1;
11462 vxattr++;
11463 }
11464 return len;
11465}
11466
11467int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11468{
11469 Mutex::Locker lock(client_lock);
11470
181888fb
FG
11471 if (unmounting)
11472 return -ENOTCONN;
11473
7c673cae
FG
11474 vinodeno_t vino = _get_vino(in);
11475
11476 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11477 tout(cct) << "ll_readlink" << std::endl;
11478 tout(cct) << vino.ino.val << std::endl;
11479
11480 set<Dentry*>::iterator dn = in->dn_set.begin();
11481 while (dn != in->dn_set.end()) {
11482 touch_dn(*dn);
11483 ++dn;
11484 }
11485
11486 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11487 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11488 return r;
11489}
11490
11491int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
11492 const UserPerm& perms, InodeRef *inp)
11493{
11494 ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
11495 << mode << dec << ", " << rdev << ", uid " << perms.uid()
11496 << ", gid " << perms.gid() << ")" << dendl;
11497
11498 if (strlen(name) > NAME_MAX)
11499 return -ENAMETOOLONG;
11500
11501 if (dir->snapid != CEPH_NOSNAP) {
11502 return -EROFS;
11503 }
11504 if (is_quota_files_exceeded(dir, perms)) {
11505 return -EDQUOT;
11506 }
11507
11508 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
11509
11510 filepath path;
11511 dir->make_nosnap_relative_path(path);
11512 path.push_dentry(name);
11513 req->set_filepath(path);
11514 req->set_inode(dir);
11515 req->head.args.mknod.rdev = rdev;
11516 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11517 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11518
11519 bufferlist xattrs_bl;
11520 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11521 if (res < 0)
11522 goto fail;
11523 req->head.args.mknod.mode = mode;
11524 if (xattrs_bl.length() > 0)
11525 req->set_data(xattrs_bl);
11526
11527 Dentry *de;
11528 res = get_or_create(dir, name, &de);
11529 if (res < 0)
11530 goto fail;
11531 req->set_dentry(de);
11532
11533 res = make_request(req, perms, inp);
11534
11535 trim_cache();
11536
11537 ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
11538 return res;
11539
11540 fail:
11541 put_request(req);
11542 return res;
11543}
11544
11545int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
11546 dev_t rdev, struct stat *attr, Inode **out,
11547 const UserPerm& perms)
11548{
11549 Mutex::Locker lock(client_lock);
11550
181888fb
FG
11551 if (unmounting)
11552 return -ENOTCONN;
11553
7c673cae
FG
11554 vinodeno_t vparent = _get_vino(parent);
11555
11556 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
11557 tout(cct) << "ll_mknod" << std::endl;
11558 tout(cct) << vparent.ino.val << std::endl;
11559 tout(cct) << name << std::endl;
11560 tout(cct) << mode << std::endl;
11561 tout(cct) << rdev << std::endl;
11562
11563 if (!cct->_conf->fuse_default_permissions) {
11564 int r = may_create(parent, perms);
11565 if (r < 0)
11566 return r;
11567 }
11568
11569 InodeRef in;
11570 int r = _mknod(parent, name, mode, rdev, perms, &in);
11571 if (r == 0) {
11572 fill_stat(in, attr);
11573 _ll_get(in.get());
11574 }
11575 tout(cct) << attr->st_ino << std::endl;
11576 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
11577 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11578 *out = in.get();
11579 return r;
11580}
11581
11582int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
11583 dev_t rdev, Inode **out,
11584 struct ceph_statx *stx, unsigned want, unsigned flags,
11585 const UserPerm& perms)
11586{
11587 unsigned caps = statx_to_mask(flags, want);
11588 Mutex::Locker lock(client_lock);
11589
181888fb
FG
11590 if (unmounting)
11591 return -ENOTCONN;
11592
7c673cae
FG
11593 vinodeno_t vparent = _get_vino(parent);
11594
11595 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
11596 tout(cct) << "ll_mknodx" << std::endl;
11597 tout(cct) << vparent.ino.val << std::endl;
11598 tout(cct) << name << std::endl;
11599 tout(cct) << mode << std::endl;
11600 tout(cct) << rdev << std::endl;
11601
11602 if (!cct->_conf->fuse_default_permissions) {
11603 int r = may_create(parent, perms);
11604 if (r < 0)
11605 return r;
11606 }
11607
11608 InodeRef in;
11609 int r = _mknod(parent, name, mode, rdev, perms, &in);
11610 if (r == 0) {
11611 fill_statx(in, caps, stx);
11612 _ll_get(in.get());
11613 }
11614 tout(cct) << stx->stx_ino << std::endl;
11615 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
11616 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11617 *out = in.get();
11618 return r;
11619}
11620
11621int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
11622 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
11623 int object_size, const char *data_pool, bool *created,
11624 const UserPerm& perms)
11625{
11626 ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
11627 mode << dec << ")" << dendl;
11628
11629 if (strlen(name) > NAME_MAX)
11630 return -ENAMETOOLONG;
11631 if (dir->snapid != CEPH_NOSNAP) {
11632 return -EROFS;
11633 }
11634 if (is_quota_files_exceeded(dir, perms)) {
11635 return -EDQUOT;
11636 }
11637
11638 // use normalized flags to generate cmode
11639 int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
11640 if (cmode < 0)
11641 return -EINVAL;
11642
11643 int64_t pool_id = -1;
11644 if (data_pool && *data_pool) {
11645 pool_id = objecter->with_osdmap(
11646 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
11647 if (pool_id < 0)
11648 return -EINVAL;
11649 if (pool_id > 0xffffffffll)
11650 return -ERANGE; // bummer!
11651 }
11652
11653 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
11654
11655 filepath path;
11656 dir->make_nosnap_relative_path(path);
11657 path.push_dentry(name);
11658 req->set_filepath(path);
11659 req->set_inode(dir);
11660 req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);
11661
11662 req->head.args.open.stripe_unit = stripe_unit;
11663 req->head.args.open.stripe_count = stripe_count;
11664 req->head.args.open.object_size = object_size;
11665 if (cct->_conf->client_debug_getattr_caps)
11666 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
11667 else
11668 req->head.args.open.mask = 0;
11669 req->head.args.open.pool = pool_id;
11670 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11671 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11672
11673 mode |= S_IFREG;
11674 bufferlist xattrs_bl;
11675 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11676 if (res < 0)
11677 goto fail;
11678 req->head.args.open.mode = mode;
11679 if (xattrs_bl.length() > 0)
11680 req->set_data(xattrs_bl);
11681
11682 Dentry *de;
11683 res = get_or_create(dir, name, &de);
11684 if (res < 0)
11685 goto fail;
11686 req->set_dentry(de);
11687
11688 res = make_request(req, perms, inp, created);
11689 if (res < 0) {
11690 goto reply_error;
11691 }
11692
11693 /* If the caller passed a value in fhp, do the open */
11694 if(fhp) {
11695 (*inp)->get_open_ref(cmode);
11696 *fhp = _create_fh(inp->get(), flags, cmode, perms);
11697 }
11698
11699 reply_error:
11700 trim_cache();
11701
11702 ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec
11703 << " layout " << stripe_unit
11704 << ' ' << stripe_count
11705 << ' ' << object_size
11706 <<") = " << res << dendl;
11707 return res;
11708
11709 fail:
11710 put_request(req);
11711 return res;
11712}
11713
11714
// Create a directory `name` under `dir` (or a snapshot, when `dir` is the
// magic snapdir).  On success, *inp references the new inode.
// Returns 0 or a negative errno.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Writable only in the live namespace or via the snapdir
  // (mkdir inside the snapdir creates a snapshot instead).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  bufferlist xattrs_bl;
  // May adjust `mode` per the parent's default POSIX ACL and emit the
  // ACL xattrs to be attached to the request payload.
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // make_request() was never reached, so we must drop the request ref here.
  put_request(req);
  return res;
}
11770
11771int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
11772 struct stat *attr, Inode **out, const UserPerm& perm)
11773{
11774 Mutex::Locker lock(client_lock);
11775
181888fb
FG
11776 if (unmounting)
11777 return -ENOTCONN;
11778
7c673cae
FG
11779 vinodeno_t vparent = _get_vino(parent);
11780
11781 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
11782 tout(cct) << "ll_mkdir" << std::endl;
11783 tout(cct) << vparent.ino.val << std::endl;
11784 tout(cct) << name << std::endl;
11785 tout(cct) << mode << std::endl;
11786
11787 if (!cct->_conf->fuse_default_permissions) {
11788 int r = may_create(parent, perm);
11789 if (r < 0)
11790 return r;
11791 }
11792
11793 InodeRef in;
11794 int r = _mkdir(parent, name, mode, perm, &in);
11795 if (r == 0) {
11796 fill_stat(in, attr);
11797 _ll_get(in.get());
11798 }
11799 tout(cct) << attr->st_ino << std::endl;
11800 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
11801 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11802 *out = in.get();
11803 return r;
11804}
11805
// statx-flavored mkdir: like ll_mkdir but fills a ceph_statx with only the
// fields requested via `want`/`flags`.  Clears stx ino/mask on failure.
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  // Client-side permission check unless FUSE does it for us.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());  // caller receives a referenced inode in *out
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11844
// Create a symlink `name` in `dir` pointing at `target`.  On success *inp
// references the new symlink inode.  Returns 0 or a negative errno.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);  // link target travels in string2
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // make_request() never ran; drop the request ref ourselves.
  put_request(req);
  return res;
}
11890
11891int Client::ll_symlink(Inode *parent, const char *name, const char *value,
11892 struct stat *attr, Inode **out, const UserPerm& perms)
11893{
11894 Mutex::Locker lock(client_lock);
11895
181888fb
FG
11896 if (unmounting)
11897 return -ENOTCONN;
11898
7c673cae
FG
11899 vinodeno_t vparent = _get_vino(parent);
11900
11901 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
11902 << dendl;
11903 tout(cct) << "ll_symlink" << std::endl;
11904 tout(cct) << vparent.ino.val << std::endl;
11905 tout(cct) << name << std::endl;
11906 tout(cct) << value << std::endl;
11907
11908 if (!cct->_conf->fuse_default_permissions) {
11909 int r = may_create(parent, perms);
11910 if (r < 0)
11911 return r;
11912 }
11913
11914 InodeRef in;
11915 int r = _symlink(parent, name, value, perms, &in);
11916 if (r == 0) {
11917 fill_stat(in, attr);
11918 _ll_get(in.get());
11919 }
11920 tout(cct) << attr->st_ino << std::endl;
11921 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
11922 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11923 *out = in.get();
11924 return r;
11925}
11926
11927int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
11928 Inode **out, struct ceph_statx *stx, unsigned want,
11929 unsigned flags, const UserPerm& perms)
11930{
11931 Mutex::Locker lock(client_lock);
11932
181888fb
FG
11933 if (unmounting)
11934 return -ENOTCONN;
11935
7c673cae
FG
11936 vinodeno_t vparent = _get_vino(parent);
11937
11938 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
11939 << dendl;
11940 tout(cct) << "ll_symlinkx" << std::endl;
11941 tout(cct) << vparent.ino.val << std::endl;
11942 tout(cct) << name << std::endl;
11943 tout(cct) << value << std::endl;
11944
11945 if (!cct->_conf->fuse_default_permissions) {
11946 int r = may_create(parent, perms);
11947 if (r < 0)
11948 return r;
11949 }
11950
11951 InodeRef in;
11952 int r = _symlink(parent, name, value, perms, &in);
11953 if (r == 0) {
11954 fill_statx(in, statx_to_mask(flags, want), stx);
11955 _ll_get(in.get());
11956 }
11957 tout(cct) << stx->stx_ino << std::endl;
11958 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
11959 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11960 *out = in.get();
11961 return r;
11962}
11963
// Unlink `name` from `dir`.  Looks up the victim inode so that any
// delegations can be broken and its caps dropped with the request.
// Returns 0 or a negative errno.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 3) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  // Declared up front because the gotos below would otherwise jump over
  // their initialization.
  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Resolve the inode being unlinked so we can attach it to the request.
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();  // recall any outstanding delegations first
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);  // make_request() never ran; release our ref
  return res;
}
12013
// Low-level unlink entry point: trace, optional permission check, then
// delegate to _unlink().  Returns 0 or a negative errno.
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // Client-side permission check unless FUSE does it for us.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
12035
// Remove directory `name` from `dir`, or remove a snapshot when `dir` is
// the magic snapdir (RMSNAP).  Returns 0 or a negative errno.
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);    // request owns a ref via set_dentry
  else
    de->get();              // RMSNAP: pin the dentry manually for now

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR) {
    req->set_inode(dir);
    req->set_other_inode(in.get());
  } else {
    // rmsnap reply carries no trace dentry, so invalidate our cached
    // dentry manually, then drop the pin taken above.
    unlink(de, true, true);
    de->put();
    req->set_other_inode(in.get());
  }

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);  // make_request() never ran; release our ref
  return res;
}
12089
// Low-level rmdir entry point: trace, optional permission check, then
// delegate to _rmdir().  Returns 0 or a negative errno.
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // Client-side permission check unless FUSE does it for us.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }

  return _rmdir(in, name, perms);
}
12112
// Rename fromdir/fromname to todir/toname.  Also handles snapshot rename
// (RENAMESNAP) when both directories are the same snapdir.  Refuses to
// cross snapshot or quota-root boundaries (-EXDEV).  Returns 0 or -errno.
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  // Can't rename across snapshots.
  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // Renaming inside the snapdir renames a snapshot; anything else in a
    // snapshot is read-only.
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  // Disallow rename across different quota roots (accounting can't move).
  if (fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);      // primary path is the destination
  req->set_filepath2(from);   // secondary path is the source

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    // Break delegations on the source inode before handing it to the MDS.
    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination may or may not exist; -ENOENT is fine (plain
    // rename), anything else is a hard failure.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);  // make_request() never ran; release our ref
  return res;
}
12217
// Low-level rename entry point: trace, optional permission checks on both
// source and destination, then delegate to _rename().
int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
		      const char *newname, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
		<< vnewparent << " " << newname << dendl;
  tout(cct) << "ll_rename" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << vnewparent.ino.val << std::endl;
  tout(cct) << newname << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(parent, name, perm);
    if (r < 0)
      return r;
    // The destination not existing is fine; rename will create it.
    r = may_delete(newparent, newname, perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }

  return _rename(parent, name, newparent, newname, perm);
}
12248
// Create hard link dir/newname -> in.  On success *inp references the
// linked inode.  Returns 0 or a negative errno.
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Both the target inode and the directory must be in the live namespace.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // Recall outstanding delegations on the target before linking.
  in->break_all_delegs();

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);  // existing inode travels in path2

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);  // make_request() never ran; release our ref
  return res;
}
12293
// Low-level link entry point: trace, optional permission checks (no hard
// links to directories), then delegate to _link().
int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
		    const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
    newname << dendl;
  tout(cct) << "ll_link" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << vnewparent << std::endl;
  tout(cct) << newname << std::endl;

  int r = 0;
  InodeRef target;

  if (!cct->_conf->fuse_default_permissions) {
    // POSIX forbids hard links to directories.
    if (S_ISDIR(in->mode))
      return -EPERM;

    r = may_hardlink(in, perm);
    if (r < 0)
      return r;

    r = may_create(newparent, perm);
    if (r < 0)
      return r;
  }

  return _link(in, newparent, newname, perm, &target);
}
12330
// Return the number of OSDs in the current OSDMap.
int Client::ll_num_osds(void)
{
  Mutex::Locker lock(client_lock);
  return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
}
12336
// Look up the address of `osd` in the OSDMap and return its IPv4 address
// in *addr, converted to host byte order.  Returns 0 on success, -1 if the
// OSD does not exist.  NOTE(review): assumes the OSD has a v4 address —
// in4_addr() on a v6 entry would not be meaningful; confirm with callers.
int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  Mutex::Locker lock(client_lock);

  entity_addr_t g;
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return false;
      g = o.get_addr(osd);
      return true;
    });
  if (!exists)
    return -1;
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;  // network byte order
  *addr = ntohl(nb_addr);
  return 0;
}
181888fb 12354
7c673cae
FG
// Return the stripe unit (bytes) of the inode's file layout.
uint32_t Client::ll_stripe_unit(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->layout.stripe_unit;
}
12360
// Return the snapshot sequence number of the inode's snap realm.
// NOTE(review): dereferences in->snaprealm unconditionally — assumes the
// inode is attached to a realm; confirm callers guarantee that.
uint64_t Client::ll_snap_seq(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->snaprealm->seq;
}
12366
// Copy the inode's file layout into *layout.  Always succeeds (returns 0).
int Client::ll_file_layout(Inode *in, file_layout_t *layout)
{
  Mutex::Locker lock(client_lock);
  *layout = in->layout;
  return 0;
}
12373
// Fh overload: forwards to the Inode overload using the handle's inode.
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
12378
12379/* Currently we cannot take advantage of redundancy in reads, since we
12380 would have to go through all possible placement groups (a
12381 potentially quite large number determined by a hash), and use CRUSH
12382 to calculate the appropriate set of OSDs for each placement group,
12383 then index into that. An array with one entry per OSD is much more
12384 tractable and works for demonstration purposes. */
12385
// Map logical block `blockno` of the file (under `layout`) to the primary
// OSD currently serving it: compute the object holding the block, map the
// object to a PG, and return that PG's acting primary.
// NOTE(review): assumes a valid layout (stripe_unit > 0, stripe_count > 0);
// a zero stripe_unit would divide by zero — confirm callers validate it.
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);

  inodeno_t ino = ll_get_inodeno(in);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;

  uint64_t stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
  uint64_t stripepos = blockno % stripe_count;   // which object in the object set (X)
  uint64_t objectsetno = stripeno / stripes_per_object;       // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
	o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
12413
12414/* Return the offset of the block, internal to the object */
12415
/* Return the offset of the block, internal to the object */
// i.e. the byte offset of logical block `blockno` within the RADOS object
// that holds it, per the inode's striping layout.
// NOTE(review): assumes layout->stripe_unit > 0 (division below).
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  Mutex::Locker lock(client_lock);
  file_layout_t *layout = &(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
12426
// Low-level opendir: optional permission check, then _opendir().  On
// success *dirpp holds the new directory handle.  Returns 0 or -errno.
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // Client-side permission check unless FUSE does it for us.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  tout(cct) << (unsigned long)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
		<< dendl;
  return r;
}
12454
// Low-level closedir: release the directory handle.  Always returns 0
// (or -ENOTCONN if the client is unmounting).
int Client::ll_releasedir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  if (unmounting)
    return -ENOTCONN;

  _closedir(dirp);
  return 0;
}
12468
// Low-level fsync on a directory handle: flush the directory inode's
// metadata (syncdataonly=false).  Returns 0 or a negative errno.
int Client::ll_fsyncdir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fsync(dirp->inode.get(), false);
}
12481
// Low-level open of an existing inode.  O_CREAT is not allowed here —
// creation goes through ll_create/_ll_create.  On success *fhp (if
// non-NULL) receives the handle, which is also tracked in
// ll_unclosed_fh_set.  Returns 0 or a negative errno.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  assert(!(flags & O_CREAT));

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!cct->_conf->fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // Track the handle (if one was produced) so an unclean shutdown can
  // still find and close it.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
12517
// Shared implementation behind ll_create/ll_createx: lookup-or-create
// `name` under `parent`, honoring O_CREAT/O_EXCL, then open it if needed.
// On success *in references the inode and *fhp (possibly produced by
// _create itself) is a tracked open handle.  Returns 0 or -errno.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // Exclusive create of an existing name fails outright.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create may also perform the open and hand back *fhp.
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Existing file: check open permission, then open unless _create (or a
    // previous step) already produced a handle.
    if (!cct->_conf->fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  assert(release_r == 0); // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Track any produced handle so an unclean shutdown can close it.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
12599
// Low-level create returning a struct stat.  On success fills *attr and,
// if outp is non-NULL, hands out an extra-referenced Inode*.  Clears
// attr->st_ino on failure.
int Client::ll_create(Inode *parent, const char *name, mode_t mode,
		      int flags, struct stat *attr, Inode **outp, Fh **fhp,
		      const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  InodeRef in;

  if (unmounting)
    return -ENOTCONN;

  int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
		     fhp, perms);
  if (r >= 0) {
    assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_stat(in, attr);
  } else {
    attr->st_ino = 0;
  }

  return r;
}
12627
// statx-flavored create: like ll_create but fills a ceph_statx with only
// the fields requested via `want`/`lflags`.  Clears stx ino/mask on failure.
int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
		       int oflags, Inode **outp, Fh **fhp,
		       struct ceph_statx *stx, unsigned want, unsigned lflags,
		       const UserPerm& perms)
{
  unsigned caps = statx_to_mask(lflags, want);
  Mutex::Locker lock(client_lock);
  InodeRef in;

  if (unmounting)
    return -ENOTCONN;

  int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
  if (r >= 0) {
    assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_statx(in, caps, stx);
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }

  return r;
}
12657
// Low-level lseek on an open handle: trace, then delegate to _lseek().
// Returns the new offset or a negative errno.
loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "ll_lseek" << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _lseek(fh, offset, whence);
}
12670
// Low-level read of `len` bytes at `off` into *bl via an open handle.
// Returns bytes read or a negative errno.
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _read(fh, off, len, bl);
}
12685
// Read `length` bytes at `offset` of RADOS object `blockid` of the file
// directly from the OSDs (bypassing the page/cap machinery) into `buf`.
// Returns bytes read or a negative errno.  Caller's buffer must be large
// enough for the returned length.
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop the client lock while the OSD round-trip completes, then retake
  // it (the Locker above still expects to unlock at scope exit).
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    // Success: copy whatever the OSD returned and report its size.
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
12722
12723/* It appears that the OSD doesn't return success unless the entire
12724 buffer was written, return the write length on success. */
12725
/* It appears that the OSD doesn't return success unless the entire
   buffer was written, return the write length on success. */
// Write `length` bytes from `buf` at `offset` of RADOS object `blockid`
// directly to the OSDs, bypassing the cap machinery.  Currently always
// performs a *stable* (synchronous) write: the `if (true || sync)` below
// deliberately disables the unstable/barrier path, which is commented out.
// Returns `length` on success or a negative errno.
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  // Local mutex/cond pair used only to wait for the OSD ack; kept separate
  // from client_lock so the wait doesn't block the whole client.
  Mutex flock("Client::ll_write_block flock");
  vinodeno_t vino = ll_get_vino(in);
  Cond cond;
  bool done;
  int r = 0;
  Context *onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe = new C_SafeCond(&flock, &cond, &done, &r);
    done = false;
  } else {
    /* if write is unstable, we just place a barrier for
     * future commits to wait on */
    /*onsafe = new C_Block_Sync(this, vino.ino,
			       barrier_interval(offset, offset + length), &r);
    */
    done = true;
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  bufferptr bp;
  if (length > 0) bp = buffer::copy(buf, length);
  bufferlist bl;
  bl.push_back(bp);

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();
  if (unmounting) {
    client_lock.Unlock();
    delete onsafe;  // never handed to the objecter; avoid leaking it
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe);

  client_lock.Unlock();
  // Stable path: block until C_SafeCond fires with the OSD's result in r.
  if (!done /* also !sync */) {
    flock.Lock();
    while (! done)
      cond.Wait(flock);
    flock.Unlock();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
12798
// Commit previously-written blocks in [offset, offset+length).
// The barrier-based implementation is currently compiled out, so this
// is a no-op that always returns 0; the disabled code is kept below for
// reference.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  Mutex::Locker lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
12824
12825int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12826{
12827 Mutex::Locker lock(client_lock);
12828 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12829 "~" << len << dendl;
12830 tout(cct) << "ll_write" << std::endl;
12831 tout(cct) << (unsigned long)fh << std::endl;
12832 tout(cct) << off << std::endl;
12833 tout(cct) << len << std::endl;
12834
181888fb
FG
12835 if (unmounting)
12836 return -ENOTCONN;
12837
7c673cae
FG
12838 int r = _write(fh, off, len, data, NULL, 0);
12839 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
12840 << dendl;
12841 return r;
12842}
12843
12844int Client::ll_flush(Fh *fh)
12845{
12846 Mutex::Locker lock(client_lock);
12847 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
12848 tout(cct) << "ll_flush" << std::endl;
12849 tout(cct) << (unsigned long)fh << std::endl;
12850
181888fb
FG
12851 if (unmounting)
12852 return -ENOTCONN;
12853
7c673cae
FG
12854 return _flush(fh);
12855}
12856
12857int Client::ll_fsync(Fh *fh, bool syncdataonly)
12858{
12859 Mutex::Locker lock(client_lock);
12860 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
12861 tout(cct) << "ll_fsync" << std::endl;
12862 tout(cct) << (unsigned long)fh << std::endl;
12863
181888fb
FG
12864 if (unmounting)
12865 return -ENOTCONN;
12866
7c673cae
FG
12867 int r = _fsync(fh, syncdataonly);
12868 if (r) {
12869 // If we're returning an error, clear it from the FH
12870 fh->take_async_err();
12871 }
12872 return r;
12873}
12874
12875#ifdef FALLOC_FL_PUNCH_HOLE
12876
// Core fallocate implementation (only built when FALLOC_FL_PUNCH_HOLE
// exists).  Supports plain allocation, KEEP_SIZE, and PUNCH_HOLE (which
// must be combined with KEEP_SIZE, matching Linux fallocate(2)).
// Returns 0 or negative errno.  Caller holds client_lock; it is dropped
// and re-taken while waiting for OSD completions below.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // PUNCH_HOLE without KEEP_SIZE is not supported.
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Refuse to allocate against a full pool; punching a hole frees space
  // so it remains allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Quota only matters when the call can grow the file.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  // State for waiting on uninline_data() completion, if we trigger it.
  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Data is inline and we hold the buffer cap: punch the hole
      // locally by rebuilding the inline blob with zeros in the range.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;	// clamp hole to current inline length
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
    } else {
      // Otherwise migrate inline data to the OSDs first (if any), then
      // zero the range on the OSDs.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline = new C_SafeCond(&uninline_flock,
                                    &uninline_cond,
                                    &uninline_done,
                                    &uninline_ret);
        uninline_data(in, onuninline);
      }

      Mutex flock("Client::_punch_hole flock");
      Cond cond;
      bool done = false;
      Context *onfinish = new C_SafeCond(&flock, &cond, &done);

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, onfinish);
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      // Drop client_lock while blocking on the zero op, then commit the
      // sync write accounting once it completes.
      client_lock.Unlock();
      flock.Lock();
      while (!done)
        cond.Wait(flock);
      flock.Unlock();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation: just extend the file size metadata if needed.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (onuninline) {
    // Wait for uninline_data() to finish (client_lock dropped meanwhile).
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else already uninlined it; treat as done.
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13011#else
13012
// Fallback when FALLOC_FL_PUNCH_HOLE is not available on this platform:
// fallocate is not supported at all.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}
13017
13018#endif
13019
13020
13021int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
13022{
13023 Mutex::Locker lock(client_lock);
13024 ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
13025 tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
13026 tout(cct) << (unsigned long)fh << std::endl;
13027
181888fb
FG
13028 if (unmounting)
13029 return -ENOTCONN;
13030
7c673cae
FG
13031 return _fallocate(fh, mode, offset, length);
13032}
13033
13034int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13035{
13036 Mutex::Locker lock(client_lock);
13037 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
13038
181888fb
FG
13039 if (unmounting)
13040 return -ENOTCONN;
13041
7c673cae
FG
13042 Fh *fh = get_filehandle(fd);
13043 if (!fh)
13044 return -EBADF;
13045#if defined(__linux__) && defined(O_PATH)
13046 if (fh->flags & O_PATH)
13047 return -EBADF;
13048#endif
13049 return _fallocate(fh, mode, offset, length);
13050}
13051
13052int Client::ll_release(Fh *fh)
13053{
13054 Mutex::Locker lock(client_lock);
13055 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
13056 dendl;
13057 tout(cct) << "ll_release (fh)" << std::endl;
13058 tout(cct) << (unsigned long)fh << std::endl;
13059
181888fb
FG
13060 if (unmounting)
13061 return -ENOTCONN;
13062
7c673cae
FG
13063 if (ll_unclosed_fh_set.count(fh))
13064 ll_unclosed_fh_set.erase(fh);
13065 return _release_fh(fh);
13066}
13067
13068int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13069{
13070 Mutex::Locker lock(client_lock);
13071
13072 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13073 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13074
181888fb
FG
13075 if (unmounting)
13076 return -ENOTCONN;
13077
7c673cae
FG
13078 return _getlk(fh, fl, owner);
13079}
13080
13081int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13082{
13083 Mutex::Locker lock(client_lock);
13084
13085 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
13086 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13087
181888fb
FG
13088 if (unmounting)
13089 return -ENOTCONN;
13090
7c673cae
FG
13091 return _setlk(fh, fl, owner, sleep);
13092}
13093
13094int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13095{
13096 Mutex::Locker lock(client_lock);
13097
13098 ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
13099 tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
13100
181888fb
FG
13101 if (unmounting)
13102 return -ENOTCONN;
13103
7c673cae
FG
13104 return _flock(fh, cmd, owner);
13105}
13106
b32b8144
FG
13107int Client::set_deleg_timeout(uint32_t timeout)
13108{
13109 Mutex::Locker lock(client_lock);
13110
13111 /*
13112 * The whole point is to prevent blacklisting so we must time out the
13113 * delegation before the session autoclose timeout kicks in.
13114 */
13115 if (timeout >= mdsmap->get_session_autoclose())
13116 return -EINVAL;
13117
13118 deleg_timeout = timeout;
13119 return 0;
13120}
13121
13122int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13123{
13124 int ret = -EINVAL;
13125
13126 Mutex::Locker lock(client_lock);
13127
13128 if (!mounted)
13129 return -ENOTCONN;
13130
13131 Inode *inode = fh->inode.get();
13132
13133 switch(cmd) {
13134 case CEPH_DELEGATION_NONE:
13135 inode->unset_deleg(fh);
13136 ret = 0;
13137 break;
13138 default:
13139 try {
13140 ret = inode->set_deleg(fh, cmd, cb, priv);
13141 } catch (std::bad_alloc) {
13142 ret = -ENOMEM;
13143 }
13144 break;
13145 }
13146 return ret;
13147}
13148
7c673cae
FG
// Finisher context that interrupts an in-flight SETFILELOCK request.
// Takes a ref on the request at construction and drops it after the
// interrupt has been delivered under client_lock.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    Mutex::Locker l(client->client_lock);
    // Only file-lock requests are interruptible this way.
    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
13164
13165void Client::ll_interrupt(void *d)
13166{
13167 MetaRequest *req = static_cast<MetaRequest*>(d);
13168 ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
13169 tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
13170 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13171}
13172
13173// =========================================
13174// layout
13175
13176// expose file layouts
13177
13178int Client::describe_layout(const char *relpath, file_layout_t *lp,
13179 const UserPerm& perms)
13180{
13181 Mutex::Locker lock(client_lock);
13182
181888fb
FG
13183 if (unmounting)
13184 return -ENOTCONN;
13185
7c673cae
FG
13186 filepath path(relpath);
13187 InodeRef in;
13188 int r = path_walk(path, &in, perms);
13189 if (r < 0)
13190 return r;
13191
13192 *lp = in->layout;
13193
13194 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
13195 return 0;
13196}
13197
13198int Client::fdescribe_layout(int fd, file_layout_t *lp)
13199{
13200 Mutex::Locker lock(client_lock);
13201
181888fb
FG
13202 if (unmounting)
13203 return -ENOTCONN;
13204
7c673cae
FG
13205 Fh *f = get_filehandle(fd);
13206 if (!f)
13207 return -EBADF;
13208 Inode *in = f->inode.get();
13209
13210 *lp = in->layout;
13211
13212 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
13213 return 0;
13214}
13215
d2e6a577
FG
13216int64_t Client::get_default_pool_id()
13217{
13218 Mutex::Locker lock(client_lock);
181888fb
FG
13219
13220 if (unmounting)
13221 return -ENOTCONN;
13222
d2e6a577
FG
13223 /* first data pool is the default */
13224 return mdsmap->get_first_data_pool();
13225}
7c673cae
FG
13226
13227// expose osdmap
13228
13229int64_t Client::get_pool_id(const char *pool_name)
13230{
13231 Mutex::Locker lock(client_lock);
181888fb
FG
13232
13233 if (unmounting)
13234 return -ENOTCONN;
13235
7c673cae
FG
13236 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13237 pool_name);
13238}
13239
13240string Client::get_pool_name(int64_t pool)
13241{
13242 Mutex::Locker lock(client_lock);
181888fb
FG
13243
13244 if (unmounting)
13245 return string();
13246
7c673cae
FG
13247 return objecter->with_osdmap([pool](const OSDMap& o) {
13248 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13249 });
13250}
13251
13252int Client::get_pool_replication(int64_t pool)
13253{
13254 Mutex::Locker lock(client_lock);
181888fb
FG
13255
13256 if (unmounting)
13257 return -ENOTCONN;
13258
7c673cae
FG
13259 return objecter->with_osdmap([pool](const OSDMap& o) {
13260 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13261 });
13262}
13263
// For a file offset, return the acting OSDs of the object that holds it
// and (optionally, via *len) the number of bytes remaining in that
// stripe unit.  Returns 0, -EBADF for an unknown fd, -EINVAL if no OSD
// is acting, or -ENOTCONN when unmounting.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map a 1-byte range at `off` to exactly one object extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13309
13310int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13311{
13312 Mutex::Locker lock(client_lock);
181888fb
FG
13313
13314 if (unmounting)
13315 return -ENOTCONN;
13316
7c673cae
FG
13317 if (id < 0)
13318 return -EINVAL;
13319 return objecter->with_osdmap([&](const OSDMap& o) {
13320 return o.crush->get_full_location_ordered(id, path);
13321 });
13322}
13323
13324int Client::get_file_stripe_address(int fd, loff_t offset,
13325 vector<entity_addr_t>& address)
13326{
13327 Mutex::Locker lock(client_lock);
13328
181888fb
FG
13329 if (unmounting)
13330 return -ENOTCONN;
13331
7c673cae
FG
13332 Fh *f = get_filehandle(fd);
13333 if (!f)
13334 return -EBADF;
13335 Inode *in = f->inode.get();
13336
13337 // which object?
13338 vector<ObjectExtent> extents;
13339 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13340 in->truncate_size, extents);
13341 assert(extents.size() == 1);
13342
13343 // now we have the object and its 'layout'
13344 return objecter->with_osdmap([&](const OSDMap& o) {
13345 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13346 vector<int> osds;
13347 o.pg_to_acting_osds(pg, osds);
13348 if (osds.empty())
13349 return -EINVAL;
13350 for (unsigned i = 0; i < osds.size(); i++) {
13351 entity_addr_t addr = o.get_addr(osds[i]);
13352 address.push_back(addr);
13353 }
13354 return 0;
13355 });
13356}
13357
13358int Client::get_osd_addr(int osd, entity_addr_t& addr)
13359{
13360 Mutex::Locker lock(client_lock);
181888fb
FG
13361
13362 if (unmounting)
13363 return -ENOTCONN;
13364
7c673cae
FG
13365 return objecter->with_osdmap([&](const OSDMap& o) {
13366 if (!o.exists(osd))
13367 return -ENOENT;
13368
13369 addr = o.get_addr(osd);
13370 return 0;
13371 });
13372}
13373
13374int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13375 loff_t length, loff_t offset)
13376{
13377 Mutex::Locker lock(client_lock);
13378
181888fb
FG
13379 if (unmounting)
13380 return -ENOTCONN;
13381
7c673cae
FG
13382 Fh *f = get_filehandle(fd);
13383 if (!f)
13384 return -EBADF;
13385 Inode *in = f->inode.get();
13386
13387 // map to a list of extents
13388 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13389
13390 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13391 return 0;
13392}
13393
13394
b32b8144 13395/* find an osd with the same ip. -ENXIO if none. */
7c673cae
FG
13396int Client::get_local_osd()
13397{
13398 Mutex::Locker lock(client_lock);
181888fb
FG
13399
13400 if (unmounting)
13401 return -ENOTCONN;
13402
7c673cae
FG
13403 objecter->with_osdmap([this](const OSDMap& o) {
13404 if (o.get_epoch() != local_osd_epoch) {
13405 local_osd = o.find_osd_on_ip(messenger->get_myaddr());
13406 local_osd_epoch = o.get_epoch();
13407 }
13408 });
13409 return local_osd;
13410}
13411
13412
13413
13414
13415
13416
13417// ===============================
13418
// Messenger callback: a connection was established.  Nothing to do
// here beyond logging.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
}
13423
// Messenger callback: our end of a connection was reset.  Returning
// false means we do not handle it here (session recovery happens in
// ms_handle_remote_reset).
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
  return false;
}
13429
// Messenger callback: the remote peer reset the connection.  For MDS
// peers, drive the affected MetaSession's state machine: a session
// being closed is treated as closed, an opening session is retried,
// and an open session is either closed for reconnect or marked stale
// depending on client_reconnect_stale.
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
  Mutex::Locker l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p) {
	if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
	  mds = p->first;
	  s = p->second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    // Carry pending open-waiters over to the replacement session.
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    const md_config_t *conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}
13490
// Messenger callback: a connection attempt was refused.  Log only;
// returning false leaves handling to the messenger's default behavior.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
  return false;
}
13496
13497bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
13498{
13499 if (dest_type == CEPH_ENTITY_TYPE_MON)
13500 return true;
13501 *authorizer = monclient->build_authorizer(dest_type);
13502 return true;
13503}
13504
// Walk up from `in` to the nearest ancestor with quota enabled (or
// root_ancestor if none).  Parents are found from cached dentries with
// valid leases when possible; otherwise a LOOKUPNAME request is issued
// to the MDS, after which the traversal restarts from `in` so that any
// freshly learned ancestry is re-checked.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    // `in` itself never terminates the walk; only a quota-enabled ancestor.
    if (cur != in && cur->quota.is_enable())
      break;

    // Try to find the parent from cached dentries with a valid lease or
    // a shared-cap-covered directory.
    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
	Dentry *dn = *p;
	if (dn->lease_mds >= 0 &&
	    dn->lease_ttl > now &&
	    mds_sessions.count(dn->lease_mds)) {
	  parent_in = dn->dir->parent_inode;
	} else {
	  Inode *diri = dn->dir->parent_inode;
	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	      diri->shared_gen == dn->cap_shared_gen) {
	    parent_in = dn->dir->parent_inode;
	  }
	}
	if (parent_in)
	  break;
      }
    } else if (root_parents.count(cur)) {
      // root of this mount may have a recorded parent (e.g. subtree mount)
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    if (cur == root_ancestor)
      break;

    // deleted inode
    if (cur->nlink == 0) {
      cur = root_ancestor;
      break;
    }

    // No cached parent: ask the MDS for it.
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
		    << " failed to find parent of " << cur->vino()
		    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    // The request may have blocked; refresh `now` for lease checks, then
    // either step to the found parent or restart the walk from `in`.
    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
13576
13577/**
13578 * Traverse quota ancestors of the Inode, return true
13579 * if any of them passes the passed function
13580 */
13581bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13582 std::function<bool (const Inode &in)> test)
13583{
13584 while (true) {
13585 assert(in != NULL);
13586 if (test(*in)) {
13587 return true;
13588 }
13589
13590 if (in == root_ancestor) {
13591 // We're done traversing, drop out
13592 return false;
13593 } else {
13594 // Continue up the tree
13595 in = get_quota_root(in, perms);
13596 }
13597 }
13598
13599 return false;
13600}
13601
13602bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13603{
13604 return check_quota_condition(in, perms,
13605 [](const Inode &in) {
13606 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13607 });
13608}
13609
13610bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
13611 const UserPerm& perms)
13612{
13613 return check_quota_condition(in, perms,
13614 [&new_bytes](const Inode &in) {
13615 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13616 > in.quota.max_bytes;
13617 });
13618}
13619
13620bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
13621{
13622 return check_quota_condition(in, perms,
13623 [](const Inode &in) {
13624 if (in.quota.max_bytes) {
13625 if (in.rstat.rbytes >= in.quota.max_bytes) {
13626 return true;
13627 }
13628
13629 assert(in.size >= in.reported_size);
13630 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
13631 const uint64_t size = in.size - in.reported_size;
13632 return (space >> 4) < size;
13633 } else {
13634 return false;
13635 }
13636 });
13637}
13638
// State bits for the pool_perms cache consulted by check_pool_perm().
enum {
  POOL_CHECKED = 1,   // a permission probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is in flight; other callers wait
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
13645
// Verify that this client's OSD caps permit the access in `need`
// (CEPH_CAP_FILE_RD/WR) on the inode's data pool.  Results are cached
// per (pool id, namespace) in pool_perms; the first caller probes the
// pool with a stat (read) and an exclusive create (write) against the
// file's first object while concurrent callers wait on
// waiting_for_pool_perm.  client_lock is dropped while the probes run.
// Returns 0, -EPERM, or -EIO on an indeterminate probe failure.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is mid-probe, wait and re-check.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: stat the object (ENOENT still proves read access).
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: exclusive create (EEXIST still proves write access).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Block on both probes with client_lock released.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // Record the result and wake any waiters.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13748
13749int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13750{
13751 if (acl_type == POSIX_ACL) {
13752 if (in->xattrs.count(ACL_EA_ACCESS)) {
13753 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13754
13755 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13756 }
13757 }
13758 return -EAGAIN;
13759}
13760
// Propagate a chmod into the inode's POSIX access ACL: refresh cached
// xattrs if needed, rewrite the ACL's mode-derived entries, and store
// the updated ACL back.  Returns 0 or a negative errno.
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Fetch xattrs from the MDS only if we have no cached version yet.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a copy so the cached xattr is untouched on failure.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      // No access ACL: nothing to rewrite.
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
13786
// Compute the ACL xattrs a new child of `dir` should be created with,
// per POSIX default-ACL inheritance.  May adjust *mode (either from
// the inherited ACL or from the umask callback).  On success returns
// the number of xattrs encoded into xattrs_bl (0 if none); negative
// errno on failure.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  // Fetch the parent's xattrs from the MDS only if not yet cached.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // Inherit from the parent's default ACL; this may rewrite *mode.
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// Non-trivial ACL: keep it as the child's access ACL unless it
	// is fully representable in the mode bits.
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask via callback, if any.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
13834
13835void Client::set_filer_flags(int flags)
13836{
13837 Mutex::Locker l(client_lock);
13838 assert(flags == 0 ||
13839 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13840 objecter->add_global_op_flags(flags);
13841}
13842
13843void Client::clear_filer_flags(int flags)
13844{
13845 Mutex::Locker l(client_lock);
13846 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13847 objecter->clear_global_op_flag(flags);
13848}
13849
13850/**
13851 * This is included in cap release messages, to cause
13852 * the MDS to wait until this OSD map epoch. It is necessary
13853 * in corner cases where we cancel RADOS ops, so that
13854 * nobody else tries to do IO to the same objects in
13855 * the same epoch as the cancelled ops.
13856 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  // Record the barrier; it is included in subsequent cap release
  // messages (see comment above).
  cap_epoch_barrier = e;
}
13862
// Config observer interface: the NULL-terminated list of config keys
// for which we want handle_conf_change() notifications.
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
13875
13876void Client::handle_conf_change(const struct md_config_t *conf,
13877 const std::set <std::string> &changed)
13878{
13879 Mutex::Locker lock(client_lock);
13880
181888fb 13881 if (changed.count("client_cache_mid")) {
7c673cae
FG
13882 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
13883 }
13884 if (changed.count("client_acl_type")) {
13885 acl_type = NO_ACL;
13886 if (cct->_conf->client_acl_type == "posix_acl")
13887 acl_type = POSIX_ACL;
13888 }
13889}
13890
// Populate the supplementary group list of `perms` for its uid/gid.
void Client::init_groups(UserPerm *perms)
{
  gid_t *sgids;
  int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
  // NOTE(review): init_gids() presumably takes ownership of the sgids
  // buffer allocated by _getgrouplist(); confirm it is freed by UserPerm.
  perms->init_gids(sgids, count);
}
13897
// boost::intrusive_ptr hook: take a reference on an Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
13902
// boost::intrusive_ptr hook: drop a reference via the owning Client so
// it can clean up when the count hits zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
13907
13908mds_rank_t Client::_get_random_up_mds() const
13909{
13910 assert(client_lock.is_locked_by_me());
13911
13912 std::set<mds_rank_t> up;
13913 mdsmap->get_up_mds_set(up);
13914
13915 if (up.empty())
13916 return MDS_RANK_NONE;
13917 std::set<mds_rank_t>::const_iterator p = up.begin();
13918 for (int n = rand() % up.size(); n; n--)
13919 ++p;
13920 return *p;
13921}
13922
13923
// A Client that owns its own Objecter (rather than sharing one), wired
// to the given messenger and mon client.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
    : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
13930
StandaloneClient::~StandaloneClient()
{
  // We created the objecter in our constructor, so we destroy it here.
  delete objecter;
  objecter = nullptr;
}
13936
// Bring up the standalone client: timer, object cacher, objecter,
// dispatchers, and the mon client.  On mon init failure, unwinds the
// partially initialized services and returns the error.
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
13967
void StandaloneClient::shutdown()
{
  // Shut down the generic client first, then the services it uses.
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}