]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
import ceph 12.2.12
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
21#include <sys/stat.h>
22#include <sys/param.h>
23#include <fcntl.h>
24#include <sys/file.h>
25#include <sys/utsname.h>
26#include <sys/uio.h>
27
28#include <boost/lexical_cast.hpp>
29#include <boost/fusion/include/std_pair.hpp>
30
31#if defined(__FreeBSD__)
32#define XATTR_CREATE 0x1
33#define XATTR_REPLACE 0x2
34#else
35#include <sys/xattr.h>
36#endif
37
38#if defined(__linux__)
39#include <linux/falloc.h>
40#endif
41
42#include <sys/statvfs.h>
43
44#include "common/config.h"
45#include "common/version.h"
46
47// ceph stuff
48#include "messages/MClientSession.h"
49#include "messages/MClientReconnect.h"
50#include "messages/MClientRequest.h"
51#include "messages/MClientRequestForward.h"
52#include "messages/MClientReply.h"
53#include "messages/MClientCaps.h"
54#include "messages/MClientLease.h"
55#include "messages/MClientSnap.h"
56#include "messages/MCommandReply.h"
57#include "messages/MOSDMap.h"
58#include "messages/MClientQuota.h"
59#include "messages/MClientCapRelease.h"
60#include "messages/MMDSMap.h"
61#include "messages/MFSMap.h"
62#include "messages/MFSMapUser.h"
63
64#include "mon/MonClient.h"
65
66#include "mds/flock.h"
67#include "osd/OSDMap.h"
68#include "osdc/Filer.h"
69
70#include "common/Cond.h"
71#include "common/Mutex.h"
72#include "common/perf_counters.h"
73#include "common/admin_socket.h"
74#include "common/errno.h"
75#include "include/str_list.h"
76
77#define dout_subsys ceph_subsys_client
78
79#include "include/lru.h"
80#include "include/compat.h"
81#include "include/stringify.h"
82
83#include "Client.h"
84#include "Inode.h"
85#include "Dentry.h"
b32b8144 86#include "Delegation.h"
7c673cae
FG
87#include "Dir.h"
88#include "ClientSnapRealm.h"
89#include "Fh.h"
90#include "MetaSession.h"
91#include "MetaRequest.h"
92#include "ObjecterWriteback.h"
93#include "posix_acl.h"
94
95#include "include/assert.h"
96#include "include/stat.h"
97
98#include "include/cephfs/ceph_statx.h"
99
100#if HAVE_GETGROUPLIST
101#include <grp.h>
102#include <pwd.h>
103#include <unistd.h>
104#endif
105
106#undef dout_prefix
107#define dout_prefix *_dout << "client." << whoami << " "
108
109#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
110
111// FreeBSD fails to define this
112#ifndef O_DSYNC
113#define O_DSYNC 0x0
114#endif
115// Darwin fails to define this
116#ifndef O_RSYNC
117#define O_RSYNC 0x0
118#endif
119
120#ifndef O_DIRECT
121#define O_DIRECT 0x0
122#endif
123
124#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
125
126void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
127{
128 Client *client = static_cast<Client*>(p);
129 client->flush_set_callback(oset);
130}
131
132
133// -------------
134
// Bind this admin-socket hook to the Client it will dispatch commands to.
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}
139
140bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
141 std::string format, bufferlist& out)
142{
143 Formatter *f = Formatter::create(format);
144 f->open_object_section("result");
145 m_client->client_lock.Lock();
146 if (command == "mds_requests")
147 m_client->dump_mds_requests(f);
148 else if (command == "mds_sessions")
149 m_client->dump_mds_sessions(f);
150 else if (command == "dump_cache")
151 m_client->dump_cache(f);
152 else if (command == "kick_stale_sessions")
153 m_client->_kick_stale_sessions();
154 else if (command == "status")
155 m_client->dump_status(f);
156 else
157 assert(0 == "bad command registered");
158 m_client->client_lock.Unlock();
159 f->close_section();
160 f->flush(out);
161 delete f;
162 return true;
163}
164
165
166// -------------
167
// Open-state of a directory stream.  Position starts at offset 0 with the
// first real entry at offset 2 (offsets 0/1 are reserved, presumably for
// "." and ".." — see the readdir_offset == 2 assertion in
// insert_readdir_results).  release/ordered counts and start_shared_gen
// start at 0; they are compared against the inode's counters elsewhere to
// decide whether cached readdir results are still valid.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
173
174void Client::_reset_faked_inos()
175{
176 ino_t start = 1024;
177 free_faked_inos.clear();
178 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
179 last_used_faked_ino = 0;
180 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
181}
182
// Hand out the next free faked inode number.  Scans the free-interval set
// starting just above the last number used, wrapping around to the bottom
// of the range once if the upper range is exhausted.
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // wrapped: restart the search from the bottom of the range
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // the pool must never be fully exhausted
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // jump forward to the start of the next free interval
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // still inside the current free interval; take the next number
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // claim the number and remember which real (ino, snap) it maps to
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
202
203void Client::_release_faked_ino(Inode *in)
204{
205 free_faked_inos.insert(in->faked_ino);
206 faked_ino_map.erase(in->faked_ino);
207}
208
209vinodeno_t Client::_map_faked_ino(ino_t ino)
210{
211 vinodeno_t vino;
212 if (ino == 1)
213 vino = root->vino();
214 else if (faked_ino_map.count(ino))
215 vino = faked_ino_map[ino];
216 else
217 vino = vinodeno_t(0, CEPH_NOSNAP);
218 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
219 return vino;
220}
221
// Public, locking wrapper around _map_faked_ino().
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  Mutex::Locker lock(client_lock);
  return _map_faked_ino(ino);
}
227
228// cons/des
229
// Construct a client bound to the given messenger/monitor/objecter.
// All user callback slots start null (registered later via the libcephfs
// API), counters/epochs start at zero, and the supporting machinery —
// writeback handler, object cacher, filer — is created here.  The client
// is not usable until init()/mount() complete.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-ENXIO), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock"),
    deleg_timeout(0)
{
  _reset_faked_inos();
  // no root inode until the first MDS reply populates it
  root = 0;

  num_flushing_caps = 0;

  // precompute listxattr buffer sizes for the virtual xattr tables
  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are never handed out
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback, // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
297
298
Client::~Client()
{
  assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
310
311void Client::tear_down_cache()
312{
313 // fd's
314 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
315 it != fd_map.end();
316 ++it) {
317 Fh *fh = it->second;
318 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
319 _release_fh(fh);
320 }
321 fd_map.clear();
322
323 while (!opened_dirs.empty()) {
324 dir_result_t *dirp = *opened_dirs.begin();
325 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
326 _closedir(dirp);
327 }
328
329 // caps!
330 // *** FIXME ***
331
332 // empty lru
7c673cae
FG
333 trim_cache();
334 assert(lru.lru_get_size() == 0);
335
336 // close root ino
337 assert(inode_map.size() <= 1 + root_parents.size());
338 if (root && inode_map.size() == 1 + root_parents.size()) {
339 delete root;
340 root = 0;
341 root_ancestor = 0;
342 while (!root_parents.empty())
343 root_parents.erase(root_parents.begin());
344 inode_map.clear();
345 _reset_faked_inos();
346 }
347
348 assert(inode_map.empty());
349}
350
351inodeno_t Client::get_root_ino()
352{
353 Mutex::Locker l(client_lock);
354 if (use_faked_inos())
355 return root->faked_ino;
356 else
357 return root->ino;
358}
359
// Take an ll reference on the root inode and return it.
Inode *Client::get_root()
{
  Mutex::Locker l(client_lock);
  root->ll_get();
  return root;
}
366
367
368// debug crapola
369
370void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
371{
372 filepath path;
373 in->make_long_path(path);
374 ldout(cct, 1) << "dump_inode: "
375 << (disconnected ? "DISCONNECTED ":"")
376 << "inode " << in->ino
377 << " " << path
378 << " ref " << in->get_num_ref()
379 << *in << dendl;
380
381 if (f) {
382 f->open_object_section("inode");
383 f->dump_stream("path") << path;
384 if (disconnected)
385 f->dump_int("disconnected", 1);
386 in->dump(f);
387 f->close_section();
388 }
389
390 did.insert(in);
391 if (in->dir) {
392 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
393 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
394 it != in->dir->dentries.end();
395 ++it) {
396 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
397 if (f) {
398 f->open_object_section("dentry");
399 it->second->dump(f);
400 f->close_section();
401 }
402 if (it->second->inode)
403 dump_inode(f, it->second->inode.get(), did, false);
404 }
405 }
406}
407
408void Client::dump_cache(Formatter *f)
409{
410 set<Inode*> did;
411
412 ldout(cct, 1) << "dump_cache" << dendl;
413
414 if (f)
415 f->open_array_section("cache");
416
417 if (root)
418 dump_inode(f, root, did, true);
419
420 // make a second pass to catch anything disconnected
421 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
422 it != inode_map.end();
423 ++it) {
424 if (did.count(it->second))
425 continue;
426 dump_inode(f, it->second, did, true);
427 }
428
429 if (f)
430 f->close_section();
431}
432
// Emit a one-shot status summary (client metadata, cache counters, map
// epochs, identity).  Caller must already hold client_lock.
void Client::dump_status(Formatter *f)
{
  assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
    f->dump_object("inst", inst);
    f->dump_stream("inst_str") << inst;
    f->dump_stream("addr_str") << inst.addr;
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}
462
// One-time startup: start the timer and object cacher, register this
// client as a message dispatcher, then finish initialization (perf
// counters and admin-socket commands).  Always returns 0.
int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}
477
478void Client::_finish_init()
479{
480 client_lock.Lock();
481 // logger
482 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
483 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
484 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
485 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
486 logger.reset(plb.create_perf_counters());
487 cct->get_perfcounters_collection()->add(logger.get());
488
489 client_lock.Unlock();
490
491 cct->_conf->add_observer(this);
492
493 AdminSocket* admin_socket = cct->get_admin_socket();
494 int ret = admin_socket->register_command("mds_requests",
495 "mds_requests",
496 &m_command_hook,
497 "show in-progress mds requests");
498 if (ret < 0) {
499 lderr(cct) << "error registering admin socket command: "
500 << cpp_strerror(-ret) << dendl;
501 }
502 ret = admin_socket->register_command("mds_sessions",
503 "mds_sessions",
504 &m_command_hook,
505 "show mds session state");
506 if (ret < 0) {
507 lderr(cct) << "error registering admin socket command: "
508 << cpp_strerror(-ret) << dendl;
509 }
510 ret = admin_socket->register_command("dump_cache",
511 "dump_cache",
512 &m_command_hook,
513 "show in-memory metadata cache contents");
514 if (ret < 0) {
515 lderr(cct) << "error registering admin socket command: "
516 << cpp_strerror(-ret) << dendl;
517 }
518 ret = admin_socket->register_command("kick_stale_sessions",
519 "kick_stale_sessions",
520 &m_command_hook,
521 "kick sessions that were remote reset");
522 if (ret < 0) {
523 lderr(cct) << "error registering admin socket command: "
524 << cpp_strerror(-ret) << dendl;
525 }
526 ret = admin_socket->register_command("status",
527 "status",
528 &m_command_hook,
529 "show overall client status");
530 if (ret < 0) {
531 lderr(cct) << "error registering admin socket command: "
532 << cpp_strerror(-ret) << dendl;
533 }
534
535 client_lock.Lock();
536 initialized = true;
537 client_lock.Unlock();
538}
539
// Orderly teardown, the reverse of init()/_finish_init(): close MDS
// sessions, unregister admin commands and the config observer, stop each
// finisher thread whose callback was ever registered, stop the object
// cacher, then the timer, the objecter finisher, and the perf counters.
void Client::shutdown()
{
  ldout(cct, 1) << "shutdown" << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf->remove_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  admin_socket->unregister_command("mds_requests");
  admin_socket->unregister_command("mds_sessions");
  admin_socket->unregister_command("dump_cache");
  admin_socket->unregister_command("kick_stale_sessions");
  admin_socket->unregister_command("status");

  // each finisher is only started when its callback is registered, so
  // only stop the ones that were actually running
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
599
600
601// ===================
602// metadata cache stuff
603
604void Client::trim_cache(bool trim_kernel_dcache)
605{
181888fb
FG
606 uint64_t max = cct->_conf->client_cache_size;
607 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
7c673cae
FG
608 unsigned last = 0;
609 while (lru.lru_get_size() != last) {
610 last = lru.lru_get_size();
611
181888fb 612 if (!unmounting && lru.lru_get_size() <= max) break;
7c673cae
FG
613
614 // trim!
31f18b77 615 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
616 if (!dn)
617 break; // done
618
619 trim_dentry(dn);
620 }
621
181888fb 622 if (trim_kernel_dcache && lru.lru_get_size() > max)
7c673cae
FG
623 _invalidate_kernel_dcache();
624
625 // hose root?
626 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
627 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
628 delete root;
629 root = 0;
630 root_ancestor = 0;
631 while (!root_parents.empty())
632 root_parents.erase(root_parents.begin());
633 inode_map.clear();
634 _reset_faked_inos();
635 }
636}
637
638void Client::trim_cache_for_reconnect(MetaSession *s)
639{
640 mds_rank_t mds = s->mds_num;
641 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
642
643 int trimmed = 0;
644 list<Dentry*> skipped;
645 while (lru.lru_get_size() > 0) {
646 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
647 if (!dn)
648 break;
649
650 if ((dn->inode && dn->inode->caps.count(mds)) ||
651 dn->dir->parent_inode->caps.count(mds)) {
652 trim_dentry(dn);
653 trimmed++;
654 } else
655 skipped.push_back(dn);
656 }
657
658 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
659 lru.lru_insert_mid(*p);
660
661 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
662 << " trimmed " << trimmed << " dentries" << dendl;
663
664 if (s->caps.size() > 0)
665 _invalidate_kernel_dcache();
666}
667
668void Client::trim_dentry(Dentry *dn)
669{
670 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
671 << " in dir " << hex << dn->dir->parent_inode->ino
672 << dendl;
673 if (dn->inode) {
674 Inode *diri = dn->dir->parent_inode;
675 diri->dir_release_count++;
676 clear_dir_complete_and_ordered(diri, true);
677 }
678 unlink(dn, false, false); // drop dir, drop dentry
679}
680
681
1adf2230
AA
// Apply MDS-reported file size / truncation state to the local inode.
// truncate_seq ordering decides whether the MDS info is newer than what
// we have; only a strictly-newer seq triggers invalidation of cached
// data.  NOTE: 'issued' is unused in this body — presumably kept for
// interface symmetry with update_inode_file_time().
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  // take the size if the MDS seq is newer, or equal but reports growth
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // truncate_size may change even at the same seq
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
723
// Apply MDS-reported timestamps (ctime/mtime/atime) to the local inode.
// time_warp_seq orders MDS time updates against local ones: when we hold
// caps that allow local time changes (EXCL/WR/BUFFER/...), MDS values are
// accepted only if their warp seq is at least as new; without such caps
// the MDS values are taken wholesale.
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
				    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
		 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
		   << " is higher than local time_warp_seq "
		   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    // ctime only ever moves forward
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // same seq: take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // MDS went backwards relative to us — should not happen
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
775
776void Client::_fragmap_remove_non_leaves(Inode *in)
777{
778 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
779 if (!in->dirfragtree.is_leaf(p->first))
780 in->fragmap.erase(p++);
781 else
782 ++p;
783}
784
785void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
786{
787 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
788 if (p->second == mds)
789 in->fragmap.erase(p++);
790 else
791 ++p;
792}
793
// Insert or refresh an inode from an MDS-supplied InodeStat.  Fields are
// updated only when the stat is newer than our copy ('new_version') or
// when the MDS grants caps we did not already hold ('new_issued'), and
// never when we hold the corresponding *_EXCL cap (our copy is then
// authoritative).  Also installs/updates the cap itself, and may mark an
// empty directory I_COMPLETE|I_DIR_ORDERED.  Returns the (possibly new)
// inode; caller must hold client_lock.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // very first inode we learn about becomes the root
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // pre-mount traversal: track ancestors of the mount point
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  // auth metadata (mode/owner/btime) — skip if we hold AUTH_EXCL
  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
  }

  // link count — skip if we hold LINK_EXCL
  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  // xattrs — ours win while we hold XATTR_EXCL
  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning indoes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inodes do not get real caps; just record what was granted
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
945
946
947/*
948 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
949 */
// Insert + link one dentry→inode pair into the metadata cache.  An
// existing dentry pointing at a different inode is unlinked first;
// creating/relinking a dentry bumps the parent's dir_ordered_count,
// invalidating any cached readdir ordering.  Finishes by refreshing the
// dentry's lease from 'dlease'.
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // existing dentry already points at the right inode
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true); // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref across the relink so 'in' cannot be destroyed under us
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// moving across directories: the old parent loses ordering too
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
995
996void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
997{
998 utime_t dttl = from;
999 dttl += (float)dlease->duration_ms / 1000.0;
1000
1001 assert(dn);
1002
1003 if (dlease->mask & CEPH_LOCK_DN) {
1004 if (dttl > dn->lease_ttl) {
1005 ldout(cct, 10) << "got dentry lease on " << dn->name
1006 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1007 dn->lease_ttl = dttl;
1008 dn->lease_mds = session->mds_num;
1009 dn->lease_seq = dlease->seq;
1010 dn->lease_gen = session->cap_gen;
1011 }
1012 }
1013 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1014}
1015
1016
1017/*
1018 * update MDS location cache for a single inode
1019 */
// Update the MDS location cache for one dirfrag of this inode: record
// (or erase, when auth < 0) which rank is authoritative, and force the
// fragtree to treat this frag as a leaf.
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // negative auth: no known authority for this frag
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1051
1052void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1053{
1054 if (diri->flags & I_COMPLETE) {
1055 if (complete) {
1056 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1057 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1058 } else {
1059 if (diri->flags & I_DIR_ORDERED) {
1060 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1061 diri->flags &= ~I_DIR_ORDERED;
1062 }
1063 }
1064 if (diri->dir)
1065 diri->dir->readdir_cache.clear();
1066 }
1067}
1068
1069/*
1070 * insert results from readdir or lssnap into the metadata cache.
1071 */
/*
 * Insert the results of a readdir or lssnap reply into the metadata cache.
 *
 * Decodes the reply's "extra" bufferlist (dirstat + dentry count + flags,
 * then one (name, lease, inode-stat) triple per entry), creates/updates the
 * corresponding Inodes and Dentries, assigns readdir offsets, and — when the
 * dir_result_t is still in sync with the directory's change counters —
 * populates the shared readdir_cache as well.
 *
 * @param request the readdir/lssnap request whose reply is being processed
 * @param session the MDS session the reply arrived on
 * @param diri    the directory inode the results belong to
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  MClientReply *reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();

  dir_result_t *dirp = request->dirp;
  assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  bufferlist::iterator p = reply->get_extra_bl().begin();
  if (!p.end()) {
    // snapdir? lssnap results live under the virtual .snap directory
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    assert(dir);

    // dirstat
    DirStat dst(p);
    __u32 numdn;
    __u16 flags;
    ::decode(numdn, p);
    ::decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    // readdir offsets 0 and 1 are reserved for "." and "..", so a fresh
    // frag always starts at offset 2
    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may have answered for a different (split/merged) frag than
    // the one we asked about; adopt the frag it actually returned
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    // only arm the shared readdir cache when we are starting from the very
    // beginning of the directory; otherwise the cache index would not line
    // up with the dentries we insert below
    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      ::decode(dname, p);
      ::decode(dlease, p);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        // in hash order, entries with the same name-hash share a logical
        // position; restart the per-hash counter whenever the hash changes
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache, but only while the dirp is still consistent
      // with the directory (no intervening release/reorder)
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          assert(0 == "unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;  // next frag (if any) starts after "." and ".."
    else
      dirp->next_offset = readdir_offset;

    // open_dir() above may have created an empty Dir; drop it again if
    // nothing was actually inserted
    if (dir->is_empty())
      close_dir(dir);
  }
}
1224
1225/** insert_trace
1226 *
1227 * insert a trace from a MDS reply into the cache.
1228 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  // Insert the (dirstat, dentry, inode) trace attached to an MDS reply
  // into the metadata cache, returning the target inode (or NULL for a
  // traceless reply / already-got-unsafe case).
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
           << " is_target=" << (int)reply->head.is_target
           << " is_dentry=" << (int)reply->head.is_dentry
           << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already carried (and we consumed) the trace;
    // the safe reply must not carry another one
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we cannot tell what changed, so conservatively
    // invalidate the parent directory's completeness
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace must be processed first so snap realms exist for the inodes
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
           << " is_target=" << (int)reply->head.is_target
           << " is_dentry=" << (int)reply->head.is_dentry
           << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug aid: verify the MDS actually returned xattrs when we asked
      // for them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target inode: a null (negative) dentry
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          // the dentry used to point somewhere; unlinking it changes dir
          // order but not completeness
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  // update_snap_trace returned a referenced realm; drop our ref now that
  // the trace has been inserted
  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1394
1395// -------
1396
/*
 * Pick the MDS rank to send a request to.
 *
 * Preference order: an explicitly requested resend_mds; the dirfrag->mds
 * map (for hashed dentry lookups); the caps we hold on the relevant inode;
 * and finally a random up MDS.
 *
 * @param req        the request needing a target
 * @param phash_diri [out, optional] set to the directory inode whose
 *                   fragmap supplied the target, so the caller can prune
 *                   it if the MDS turns out to be stopped
 * @return the chosen mds rank (never MDS_RANK_NONE on return; the random
 *         fallback always picks something)
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  // a forward or retry may have pinned a specific target already
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component to find the owning dirfrag
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes hold no caps; walk up to a non-snap ancestor
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dn_set.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
        goto out;
      }
    }

    // fall back to whichever mds issued us caps (preferring the auth cap
    // when the op must go to the auth mds)
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1500
1501
1502void Client::connect_mds_targets(mds_rank_t mds)
1503{
1504 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1505 assert(mds_sessions.count(mds));
1506 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1507 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1508 q != info.export_targets.end();
1509 ++q) {
1510 if (mds_sessions.count(*q) == 0 &&
1511 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1512 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1513 << " export target mds." << *q << dendl;
1514 _open_mds_session(*q);
1515 }
1516 }
1517}
1518
1519void Client::dump_mds_sessions(Formatter *f)
1520{
1521 f->dump_int("id", get_nodeid().v);
1adf2230
AA
1522 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
1523 f->dump_object("inst", inst);
1524 f->dump_stream("inst_str") << inst;
1525 f->dump_stream("addr_str") << inst.addr;
7c673cae
FG
1526 f->open_array_section("sessions");
1527 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1528 f->open_object_section("session");
1529 p->second->dump(f);
1530 f->close_section();
1531 }
1532 f->close_section();
1533 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1534}
1535void Client::dump_mds_requests(Formatter *f)
1536{
1537 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1538 p != mds_requests.end();
1539 ++p) {
1540 f->open_object_section("request");
1541 p->second->dump(f);
1542 f->close_section();
1543 }
1544}
1545
int Client::verify_reply_trace(int r,
                               MetaRequest *request, MClientReply *reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // Resolve the target inode for a reply. If the reply carried no trace
  // (e.g. the MDS restarted and replayed the request), fall back to a
  // lookup/getattr to find what we created or operated on.
  //
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // insert_trace already resolved the target for us
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name.  someday, do this by the
      // ino... which we know!  FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          assert(0 == "how did this happen?  i want logs!");
        }
      } else {
        // no dentry either: force a fresh getattr on the request's inode
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          // someone else won the create race and our name now points at a
          // different inode; report as interrupted
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1618
1619
1620/**
1621 * make a request
1622 *
1623 * Blocking helper to make an MDS request.
1624 *
1625 * If the ptarget flag is set, behavior changes slightly: the caller
1626 * expects to get a pointer to the inode we are creating or operating
1627 * on. As a result, we will follow up any traceless mutation reply
1628 * with a getattr or lookup to transparently handle a traceless reply
1629 * from the MDS (as when the MDS restarts and the client has to replay
1630 * a request).
1631 *
1632 * @param request the MetaRequest to execute
1633 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1634 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1635 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1636 * @param use_mds [optional] prefer a specific mds (-1 for default)
1637 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1638 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  // Blocking send-and-wait loop; see the doxygen comment above. Runs with
  // client_lock held (caller_cond.Wait below releases/reacquires it).
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note; mds_requests holds a ref (request->get()) until
  // unregister_request
  mds_requests[tid] = request->get();
  // SETFILELOCK requests can block indefinitely, so they are excluded from
  // oldest-tid tracking (which the MDS uses to trim completed requests)
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        // the target rank no longer exists (cluster shrank); fix up our
        // routing info and retry immediately
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        // target exists but is not usable yet; block for a new mdsmap
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
           request->resend_mds < 0 && // forward
           !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // we only fall out of the loop without a reply when aborted
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);  // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!) -- the dispatch thread is parked in
  // handle_client_reply waiting for us to take ownership of the reply
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
1786
1787void Client::unregister_request(MetaRequest *req)
1788{
1789 mds_requests.erase(req->tid);
1790 if (req->tid == oldest_tid) {
1791 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1792 while (true) {
1793 if (p == mds_requests.end()) {
1794 oldest_tid = 0;
1795 break;
1796 }
1797 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1798 oldest_tid = p->first;
1799 break;
1800 }
1801 ++p;
1802 }
1803 }
1804 put_request(req);
1805}
1806
void Client::put_request(MetaRequest *request)
{
  // Drop one reference; on the last ref, destroy the request and then try
  // to trim the "other" inode (e.g. the inode removed by rmdir/rename).
  // Note the ordering: op/other_in are captured *before* delete, and the
  // trim runs after, so it never touches the freed request.
  if (request->_put()) {
    int op = -1;
    if (request->success)
      op = request->get_op();
    InodeRef other_in;
    request->take_other_inode(&other_in);
    delete request;

    if (other_in &&
        (op == CEPH_MDS_OP_RMDIR ||
         op == CEPH_MDS_OP_RENAME ||
         op == CEPH_MDS_OP_RMSNAP)) {
      _try_to_trim_inode(other_in.get(), false);
    }
  }
}
1825
1826int Client::encode_inode_release(Inode *in, MetaRequest *req,
1827 mds_rank_t mds, int drop,
1828 int unless, int force)
1829{
1830 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1831 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1832 << ", have:" << ", force:" << force << ")" << dendl;
1833 int released = 0;
1834 if (in->caps.count(mds)) {
1835 Cap *caps = in->caps[mds];
1836 drop &= ~(in->dirty_caps | get_caps_used(in));
1837 if ((drop & caps->issued) &&
1838 !(unless & caps->issued)) {
1839 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1840 caps->issued &= ~drop;
1841 caps->implemented &= ~drop;
1842 released = 1;
1843 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1844 } else {
1845 released = force;
1846 }
1847 if (released) {
1848 ceph_mds_request_release rel;
1849 rel.ino = in->ino;
1850 rel.cap_id = caps->cap_id;
1851 rel.seq = caps->seq;
1852 rel.issue_seq = caps->issue_seq;
1853 rel.mseq = caps->mseq;
1854 rel.caps = caps->implemented;
1855 rel.wanted = caps->wanted;
1856 rel.dname_len = 0;
1857 rel.dname_seq = 0;
1858 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1859 }
1860 }
1861 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1862 << released << dendl;
1863 return released;
1864}
1865
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
                                   mds_rank_t mds, int drop, int unless)
{
  // Release the dentry's lease alongside the parent directory inode's caps.
  // Note: relies on encode_inode_release having just pushed a Release onto
  // req->cap_releases (force=1 guarantees it when released != 0), which we
  // then annotate with the dentry name/seq.
  ldout(cct, 20) << "encode_dentry_release enter(dn:"
           << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
                                    mds, drop, unless, 1);
  // only preempt the lease if it was issued by the mds we're talking to
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    MClientRequest::Release& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << "encode_dentry_release exit(dn:"
           << dn << ")" << dendl;
}
1885
1886
1887/*
1888 * This requires the MClientRequest *request member to be set.
1889 * It will error out horribly without one.
1890 * Additionally, if you set any *drop member, you'd better have
1891 * set the corresponding dentry!
1892 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  // Encode releases for every inode/dentry the request flagged for
  // dropping (see the warning comment above: *_drop fields require the
  // corresponding inode/dentry to be set on the request).
  ldout(cct, 20) << "encode_cap_releases enter (req: "
                 << req << ", mds: " << mds << ")" << dendl;
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
                         mds, req->inode_drop,
                         req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
                         mds, req->old_inode_drop,
                         req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
                         mds, req->other_inode_drop,
                         req->other_inode_unless);

  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
                          mds, req->dentry_drop,
                          req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
                          mds, req->old_dentry_drop,
                          req->old_dentry_unless);
  ldout(cct, 25) << "encode_cap_releases exit (req: "
                 << req << ", mds " << mds <<dendl;
}
1923
1924bool Client::have_open_session(mds_rank_t mds)
1925{
1926 return
1927 mds_sessions.count(mds) &&
1928 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1929 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1930}
1931
1932MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1933{
1934 if (mds_sessions.count(mds) == 0)
1935 return NULL;
1936 MetaSession *s = mds_sessions[mds];
1937 if (s->con != con)
1938 return NULL;
1939 return s;
1940}
1941
1942MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1943{
1944 if (mds_sessions.count(mds))
1945 return mds_sessions[mds];
1946 return _open_mds_session(mds);
1947}
1948
1949/**
1950 * Populate a map of strings with client-identifying metadata,
1951 * such as the hostname. Call this once at initialization.
1952 */
1953void Client::populate_metadata(const std::string &mount_root)
1954{
1955 // Hostname
1956 struct utsname u;
1957 int r = uname(&u);
1958 if (r >= 0) {
1959 metadata["hostname"] = u.nodename;
1960 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1961 } else {
1962 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1963 }
1964
1965 metadata["pid"] = stringify(getpid());
1966
1967 // Ceph entity id (the '0' in "client.0")
1968 metadata["entity_id"] = cct->_conf->name.get_id();
1969
1970 // Our mount position
1971 if (!mount_root.empty()) {
1972 metadata["root"] = mount_root;
1973 }
1974
1975 // Ceph version
1976 metadata["ceph_version"] = pretty_version_to_str();
1977 metadata["ceph_sha1"] = git_version_to_str();
1978
1979 // Apply any metadata from the user's configured overrides
1980 std::vector<std::string> tokens;
1981 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1982 for (const auto &i : tokens) {
1983 auto eqpos = i.find("=");
1984 // Throw out anything that isn't of the form "<str>=<str>"
1985 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1986 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1987 continue;
1988 }
1989 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1990 }
1991}
1992
1993/**
1994 * Optionally add or override client metadata fields.
1995 */
void Client::update_metadata(std::string const &k, std::string const &v)
{
  // Public entry point, so take client_lock ourselves; only valid after
  // init() has run.
  Mutex::Locker l(client_lock);
  assert(initialized);

  // overriding an existing key is allowed but worth a loud log line
  if (metadata.count(k)) {
    ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
                  << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
  }

  metadata[k] = v;
}
2008
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  // Create and register a new session in OPENING state and (unless this
  // MDS previously rejected us) send the session-open request.
  // Precondition: no session for this rank exists yet.
  ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
  assert(mds_sessions.count(mds) == 0);
  MetaSession *session = new MetaSession;
  session->mds_num = mds;
  session->seq = 0;
  session->inst = mdsmap->get_inst(mds);
  session->con = messenger->get_connection(session->inst);
  session->state = MetaSession::STATE_OPENING;
  session->mds_state = MDSMap::STATE_NULL;
  // register before the early-return below: callers expect the session to
  // exist (in OPENING state) even when we skip sending the open request
  mds_sessions[mds] = session;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->inst) {
      ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      // same rank but a different daemon instance; give it another chance
      ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
  m->client_meta = metadata;
  session->con->send_message(m);
  return session;
}
2041
void Client::_close_mds_session(MetaSession *s)
{
  // Ask the MDS to close the session; the session stays in CLOSING state
  // until the MDS confirms (see CEPH_SESSION_CLOSE handling).
  ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSING;
  s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
}
2048
void Client::_closed_mds_session(MetaSession *s)
{
  // Tear down a session that is definitively gone: wake anyone waiting on
  // it, drop its caps, re-route its in-flight requests, then free it.
  // Ordering matters — waiters and requests must be kicked before the
  // session is erased and deleted. 's' is invalid after this returns.
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
  delete s;
}
2060
void Client::handle_client_session(MClientSession *m)
{
  // Dispatch an incoming session-control message from an MDS. Messages
  // from connections we no longer have a session on are dropped.
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only honor the ack if it matches our latest renew request
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      // if caps had gone stale, wake waiters now that they're valid again
      if (was_stale)
        wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    // NOTE: the inner 'm' deliberately shadows the message 'm' within this
    // scope; the get_seq() call below the block refers to the outer message.
    {
      auto& m = session->release;
      if (m) {
        session->con->send_message(std::move(m));
        m = nullptr;
      }
    }
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    // remember the rejecting instance so we don't immediately retry it
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);

    break;

  default:
    ceph_abort();
  }

  m->put();
}
2138
2139bool Client::_any_stale_sessions() const
2140{
2141 assert(client_lock.is_locked_by_me());
2142
2143 for (const auto &i : mds_sessions) {
2144 if (i.second->state == MetaSession::STATE_STALE) {
2145 return true;
2146 }
2147 }
2148
2149 return false;
2150}
2151
2152void Client::_kick_stale_sessions()
2153{
2154 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2155
2156 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2157 p != mds_sessions.end(); ) {
2158 MetaSession *s = p->second;
2159 ++p;
2160 if (s->state == MetaSession::STATE_STALE)
2161 _closed_mds_session(s);
2162 }
2163}
2164
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // (Re)build the wire message for `request` and send it to this session's
  // MDS, recording the bookkeeping needed to match the eventual reply.
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // Replaying an op the MDS already applied (we hold an unsafe reply);
    // include the target ino so the MDS can match the completed request.
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may reference pool names; pin the osdmap epoch we resolved with
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // first transmission of this request
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;  // lets ESTALE handling detect cap migration

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2209
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  // Translate an in-memory MetaRequest into a freshly allocated
  // MClientRequest wire message.  Caller owns the returned message.
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// null dentry: path of the parent dir plus the dentry name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // retry_attempt is bumped on every (re)build so the MDS can spot retries
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2247
2248
2249
2250void Client::handle_client_request_forward(MClientRequestForward *fwd)
2251{
2252 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2253 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2254 if (!session) {
2255 fwd->put();
2256 return;
2257 }
2258 ceph_tid_t tid = fwd->get_tid();
2259
2260 if (mds_requests.count(tid) == 0) {
2261 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2262 fwd->put();
2263 return;
2264 }
2265
2266 MetaRequest *request = mds_requests[tid];
2267 assert(request);
2268
2269 // reset retry counter
2270 request->retry_attempt = 0;
2271
2272 // request not forwarded, or dest mds has no session.
2273 // resend.
2274 ldout(cct, 10) << "handle_client_request tid " << tid
2275 << " fwd " << fwd->get_num_fwd()
2276 << " to mds." << fwd->get_dest_mds()
2277 << ", resending to " << fwd->get_dest_mds()
2278 << dendl;
2279
2280 request->mds = -1;
2281 request->item.remove_myself();
2282 request->num_fwd = fwd->get_num_fwd();
2283 request->resend_mds = fwd->get_dest_mds();
2284 request->caller_cond->Signal();
2285
2286 fwd->put();
2287}
2288
2289bool Client::is_dir_operation(MetaRequest *req)
2290{
2291 int op = req->get_op();
2292 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2293 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2294 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2295 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2296 return true;
2297 return false;
2298}
2299
void Client::handle_client_reply(MClientReply *reply)
{
  // Process an MDS reply.  A request may receive up to two replies: first an
  // "unsafe" one (applied in memory, not yet journaled) and later a "safe"
  // one (committed).  Called with client_lock held.
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // Retry only if we'd target a different MDS, or the cap mseq changed
    // since we sent (caps migrated); otherwise the ESTALE is final.
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      request->caller_cond->Signal();   // caller will resend to resend_mds
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }

  assert(request->reply == NULL);
  request->reply = reply;
  insert_trace(request, session);   // apply the reply's metadata trace to our cache

  // Handle unsafe reply
  if (!is_safe) {
    // Track the request until the safe reply arrives, on the session, the
    // parent dir (for dir ops), and the target inode.
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wake for kick back
    // Block (releasing client_lock inside Wait) until the caller has consumed
    // the reply and cleared dispatch_cond to kick us back.
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);  // wake fsync/unmount waiters
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
2402
2403void Client::_handle_full_flag(int64_t pool)
2404{
2405 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2406 << "on " << pool << dendl;
2407 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2408 // to do this rather than blocking, because otherwise when we fill up we
2409 // potentially lock caps forever on files with dirty pages, and we need
2410 // to be able to release those caps to the MDS so that it can delete files
2411 // and free up space.
2412 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2413
2414 // For all inodes with layouts in this pool and a pending flush write op
2415 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2416 // from ObjectCacher so that it doesn't re-issue the write in response to
2417 // the ENOSPC error.
2418 // Fortunately since we're cancelling everything in a given pool, we don't
2419 // need to know which ops belong to which ObjectSet, we can just blow all
2420 // the un-flushed cached data away and mark any dirty inodes' async_err
2421 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2422 // affecting this pool, and all the objectsets we're purging were also
2423 // in this pool.
2424 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2425 i != inode_map.end(); ++i)
2426 {
2427 Inode *inode = i->second;
2428 if (inode->oset.dirty_or_tx
2429 && (pool == -1 || inode->layout.pool_id == pool)) {
2430 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2431 << " has dirty objects, purging and setting ENOSPC" << dendl;
2432 objectcacher->purge_set(&inode->oset);
2433 inode->set_async_err(-ENOSPC);
2434 }
2435 }
2436
2437 if (cancelled_epoch != (epoch_t)-1) {
2438 set_cap_epoch_barrier(cancelled_epoch);
2439 }
2440}
2441
void Client::handle_osd_map(MOSDMap *m)
{
  // React to a new osdmap: detect (un)blacklisting of this client, and
  // cancel/purge writes to pools that have raised their full flag.
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddr = messenger->get_myaddr();
  if (!blacklisted && new_blacklists.count(myaddr)) {
    // We were just blacklisted: abort everything in flight.
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
	});
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;
    // Abort all pending MDS requests and wake their callers so they return
    // -EBLACKLISTED.  Advance the iterator before touching the request.
    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
	 p != mds_requests.end(); ) {
      auto req = p->second;
      ++p;
      req->abort(-EBLACKLISTED);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
    }

    // Progress aborts on any requests that were on this waitlist.  Any
    // requests that were on a waiting_for_open session waitlist
    // will get kicked during close session below.
    signal_cond_list(waiting_for_mdsmap);

    // Force-close all sessions: assume this is not abandoning any state
    // on the MDS side because the MDS will have seen the blacklist too.
    while(!mds_sessions.empty()) {
      auto i = mds_sessions.begin();
      auto session = i->second;
      _closed_mds_session(session);   // erases the entry from mds_sessions
    }

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
	return o.is_blacklisted(myaddr);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // Cluster-wide full flag: cancel writes to every pool.
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }

  m->put();
}
2525
2526
2527// ------------------------
2528// incoming messages
2529
2530
bool Client::ms_dispatch(Message *m)
{
  // Central messenger entry point; serializes all message handling under
  // client_lock.  Returns false for message types we don't own so other
  // dispatchers can claim them.
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;   // not from an MDS; let another dispatcher take it
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  if (unmounting) {
    // While unmounting, opportunistically trim the cache on every message
    // and poke unmount() when progress was made.
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
		   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    // NOTE(review): "shrank" would normally mean the new total is SMALLER
    // (size > new total); as written this branch only fires if the total
    // grew during trim_cache() — verify intent against upstream.
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2608
2609void Client::handle_fs_map(MFSMap *m)
2610{
2611 fsmap.reset(new FSMap(m->get_fsmap()));
2612 m->put();
2613
2614 signal_cond_list(waiting_for_fsmap);
2615
2616 monclient->sub_got("fsmap", fsmap->get_epoch());
2617}
2618
2619void Client::handle_fs_map_user(MFSMapUser *m)
2620{
2621 fsmap_user.reset(new FSMapUser);
2622 *fsmap_user = m->get_fsmap();
2623 m->put();
2624
2625 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2626 signal_cond_list(waiting_for_fsmap);
2627}
2628
2629void Client::handle_mds_map(MMDSMap* m)
2630{
f64942e4 2631 mds_gid_t old_inc, new_inc;
7c673cae
FG
2632 if (m->get_epoch() <= mdsmap->get_epoch()) {
2633 ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
2634 << " is identical to or older than our "
2635 << mdsmap->get_epoch() << dendl;
2636 m->put();
2637 return;
f64942e4 2638 }
7c673cae
FG
2639
2640 ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;
2641
2642 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2643 oldmap.swap(mdsmap);
2644
2645 mdsmap->decode(m->get_encoded());
2646
2647 // Cancel any commands for missing or laggy GIDs
2648 std::list<ceph_tid_t> cancel_ops;
2649 auto &commands = command_table.get_commands();
2650 for (const auto &i : commands) {
2651 auto &op = i.second;
2652 const mds_gid_t op_mds_gid = op.mds_gid;
2653 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2654 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2655 cancel_ops.push_back(i.first);
2656 if (op.outs) {
2657 std::ostringstream ss;
2658 ss << "MDS " << op_mds_gid << " went away";
2659 *(op.outs) = ss.str();
2660 }
2661 op.con->mark_down();
2662 if (op.on_finish) {
2663 op.on_finish->complete(-ETIMEDOUT);
2664 }
2665 }
2666 }
2667
2668 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2669 i != cancel_ops.end(); ++i) {
2670 command_table.erase(*i);
2671 }
2672
2673 // reset session
2674 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2675 p != mds_sessions.end(); ) {
2676 mds_rank_t mds = p->first;
2677 MetaSession *session = p->second;
2678 ++p;
2679
2680 int oldstate = oldmap->get_state(mds);
2681 int newstate = mdsmap->get_state(mds);
2682 if (!mdsmap->is_up(mds)) {
2683 session->con->mark_down();
2684 } else if (mdsmap->get_inst(mds) != session->inst) {
f64942e4
AA
2685 old_inc = oldmap->get_incarnation(mds);
2686 new_inc = mdsmap->get_incarnation(mds);
2687 if (old_inc != new_inc) {
2688 ldout(cct, 1) << "mds incarnation changed from "
2689 << old_inc << " to " << new_inc << dendl;
2690 oldstate = MDSMap::STATE_NULL;
2691 }
7c673cae
FG
2692 session->con->mark_down();
2693 session->inst = mdsmap->get_inst(mds);
2694 // When new MDS starts to take over, notify kernel to trim unused entries
2695 // in its dcache/icache. Hopefully, the kernel will release some unused
2696 // inodes before the new MDS enters reconnect state.
2697 trim_cache_for_reconnect(session);
2698 } else if (oldstate == newstate)
2699 continue; // no change
2700
2701 session->mds_state = newstate;
f64942e4
AA
2702 if (old_inc != new_inc && newstate > MDSMap::STATE_RECONNECT) {
2703 // missed reconnect close the session so that it can be reopened
2704 _closed_mds_session(session);
2705 continue;
2706 }
7c673cae
FG
2707 if (newstate == MDSMap::STATE_RECONNECT) {
2708 session->con = messenger->get_connection(session->inst);
2709 send_reconnect(session);
2710 } else if (newstate >= MDSMap::STATE_ACTIVE) {
2711 if (oldstate < MDSMap::STATE_ACTIVE) {
2712 // kick new requests
2713 kick_requests(session);
2714 kick_flushing_caps(session);
2715 signal_context_list(session->waiting_for_open);
a8e16298 2716 wake_up_session_caps(session, true);
7c673cae
FG
2717 }
2718 connect_mds_targets(mds);
2719 } else if (newstate == MDSMap::STATE_NULL &&
2720 mds >= mdsmap->get_max_mds()) {
2721 _closed_mds_session(session);
2722 }
2723 }
2724
2725 // kick any waiting threads
2726 signal_cond_list(waiting_for_mdsmap);
2727
2728 m->put();
2729
2730 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2731}
2732
void Client::send_reconnect(MetaSession *session)
{
  // A recovering MDS entered reconnect state: resend our cap and snaprealm
  // state so it can rebuild this client's session.
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  if (session->release) {
    session->release->put();   // drop any queued cap-release message
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  // For every inode holding caps from this MDS, describe the cap (with seqs
  // reset), its path, file locks, and its snaprealm (each realm only once).
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(in->caps[mds]->issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      cap->issued = cap->implemented;   // claim everything we actually hold

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  // push any caps we were mid-flush on before the failover
  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();   // unmount() may be waiting on reconnect progress
}
2808
2809
2810void Client::kick_requests(MetaSession *session)
2811{
2812 ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2813 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2814 p != mds_requests.end();
2815 ++p) {
31f18b77
FG
2816 MetaRequest *req = p->second;
2817 if (req->got_unsafe)
2818 continue;
2819 if (req->aborted()) {
2820 if (req->caller_cond) {
2821 req->kick = true;
2822 req->caller_cond->Signal();
2823 }
7c673cae 2824 continue;
31f18b77
FG
2825 }
2826 if (req->retry_attempt > 0)
7c673cae 2827 continue; // new requests only
31f18b77 2828 if (req->mds == session->mds_num) {
7c673cae
FG
2829 send_request(p->second, session);
2830 }
2831 }
2832}
2833
2834void Client::resend_unsafe_requests(MetaSession *session)
2835{
2836 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2837 !iter.end();
2838 ++iter)
2839 send_request(*iter, session);
2840
2841 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2842 // process completed requests in clientreplay stage.
2843 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2844 p != mds_requests.end();
2845 ++p) {
2846 MetaRequest *req = p->second;
2847 if (req->got_unsafe)
2848 continue;
31f18b77
FG
2849 if (req->aborted())
2850 continue;
7c673cae
FG
2851 if (req->retry_attempt == 0)
2852 continue; // old requests only
2853 if (req->mds == session->mds_num)
2854 send_request(req, session, true);
2855 }
2856}
2857
2858void Client::wait_unsafe_requests()
2859{
2860 list<MetaRequest*> last_unsafe_reqs;
2861 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2862 p != mds_sessions.end();
2863 ++p) {
2864 MetaSession *s = p->second;
2865 if (!s->unsafe_requests.empty()) {
2866 MetaRequest *req = s->unsafe_requests.back();
2867 req->get();
2868 last_unsafe_reqs.push_back(req);
2869 }
2870 }
2871
2872 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2873 p != last_unsafe_reqs.end();
2874 ++p) {
2875 MetaRequest *req = *p;
2876 if (req->unsafe_item.is_on_list())
2877 wait_on_list(req->waitfor_safe);
2878 put_request(req);
2879 }
2880}
2881
void Client::kick_requests_closed(MetaSession *session)
{
  // The session is going away: wake every caller with a request targeted at
  // this MDS so it can retry or abort, and drop unsafe-request tracking.
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;   // advance first: unregister_request() below erases from mds_requests
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	// We received an unsafe reply but the safe one will never come;
	// complete it anyway (the op may or may not be durable — hence lderr).
	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
2908
2909
2910
2911
2912/************
2913 * leases
2914 */
2915
2916void Client::got_mds_push(MetaSession *s)
2917{
2918 s->seq++;
2919 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2920 if (s->state == MetaSession::STATE_CLOSING) {
2921 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2922 }
2923}
2924
void Client::handle_lease(MClientLease *m)
{
  // The MDS is revoking a dentry lease: drop it locally (if we still hold
  // it) and always ack with a LEASE_RELEASE.  Only revoke is expected here.
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;   // must still ack even if we no longer have the inode
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;   // forget the lease; -1 means no MDS leases this dentry
  }

 revoke:
  m->get_connection()->send_message(
      new MClientLease(
	CEPH_MDS_LEASE_RELEASE, seq,
	m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2967
void Client::put_inode(Inode *in, int n)
{
  // Drop n references on the inode; when the last reference goes, remove the
  // inode from every client-side index and free it.
  ldout(cct, 10) << "put_inode on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    assert(!unclean);   // all dirty data must already have been flushed
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      // dropping the root: clear the cached root-ancestor chain too
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
	root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
2993
void Client::close_dir(Dir *dir)
{
  // Free an empty Dir object and release the pins it held on its parent
  // inode and (if any) the parent's dentry.
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
  assert(dir->is_empty());
  assert(in->dir == dir);
  assert(in->dn_set.size() < 2); // dirs can't be hard-linked
  if (!in->dn_set.empty())
    in->get_first_parent()->put(); // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}
3008
 /**
 * Link a dentry (creating it if needed) in `dir` under `name` to inode `in`.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;

    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn); // mid or top?
    if (!in)
      dir->num_null_dentries++;   // track negative (null) dentries for trimming

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // reusing a pre-created dentry; it must currently be null
    assert(!dn->inode);
    if (in)
      dir->num_null_dentries--;   // it is about to stop being null
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    if (in->is_dir()) {
      // extra pins keep the dentry alive while the dir inode has an open Dir
      // or is referenced via the low-level (ll) interface
      if (in->dir)
	dn->get(); // dir -> dn pin
      if (in->ll_ref)
	dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link inode " << in << " parents now " << in->dn_set << dendl;
  }

  return dn;
}
3066
3067void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3068{
3069 InodeRef in;
3070 in.swap(dn->inode);
3071 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3072 << " inode " << dn->inode << dendl;
3073
3074 // unlink from inode
3075 if (in) {
3076 if (in->is_dir()) {
3077 if (in->dir)
3078 dn->put(); // dir -> dn pin
3079 if (in->ll_ref)
3080 dn->put(); // ll_ref -> dn pin
3081 }
3082 dn->inode = 0;
3083 assert(in->dn_set.count(dn));
3084 in->dn_set.erase(dn);
3085 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3086 }
3087
3088 if (keepdentry) {
3089 dn->lease_mds = -1;
91327a77
AA
3090 if (in)
3091 dn->dir->num_null_dentries++;
7c673cae
FG
3092 } else {
3093 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3094
3095 // unlink from dir
3096 dn->dir->dentries.erase(dn->name);
91327a77
AA
3097 if (!in)
3098 dn->dir->num_null_dentries--;
7c673cae
FG
3099 if (dn->dir->is_empty() && !keepdir)
3100 close_dir(dn->dir);
3101 dn->dir = 0;
3102
3103 // delete den
3104 lru.lru_remove(dn);
3105 dn->put();
3106 }
3107}
3108
3109/**
3110 * For asynchronous flushes, check for errors from the IO and
3111 * update the inode if necessary
3112 */
3113 class C_Client_FlushComplete : public Context {
3114 private:
3115 Client *client;
// InodeRef (not a raw pointer) keeps the inode alive until the flush finishes.
3116 InodeRef inode;
3117 public:
3118 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
// Called when the async flush completes; runs under client_lock (asserted).
// On error, records the error on the inode so a later fsync/close can report it.
3119 void finish(int r) override {
3120 assert(client->client_lock.is_locked_by_me());
3121 if (r != 0) {
3122 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3123 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3124 << " 0x" << std::hex << inode->ino << std::dec
3125 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3126 inode->set_async_err(r);
3127 }
3128 }
3129 };
3130
3131
3132/****
3133 * caps
3134 */
3135
3136void Client::get_cap_ref(Inode *in, int cap)
3137{
3138 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3139 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3140 ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3141 in->get();
3142 }
3143 if ((cap & CEPH_CAP_FILE_CACHE) &&
3144 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3145 ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3146 in->get();
3147 }
3148 in->get_cap_ref(cap);
3149}
3150
// Drop references on capability bits; when the last reference of a bit goes
// away we may need to finish a pending cap_snap, wake blocked writers/committers,
// flush caps back to the MDS, and release the inode pins taken in get_cap_ref().
3151 void Client::put_cap_ref(Inode *in, int cap)
3152 {
3153 int last = in->put_cap_ref(cap);
3154 if (last) {
3155 int put_nref = 0;
// bits we just released that the MDS no longer issues -> ack via check_caps()
3156 int drop = last & ~in->caps_issued();
3157 if (in->snapid == CEPH_NOSNAP) {
// last WR ref dropped: complete the cap_snap that was waiting on the write
3158 if ((last & CEPH_CAP_FILE_WR) &&
3159 !in->cap_snaps.empty() &&
3160 in->cap_snaps.rbegin()->second.writing) {
3161 ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
3162 in->cap_snaps.rbegin()->second.writing = 0;
3163 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3164 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3165 }
3166 if (last & CEPH_CAP_FILE_BUFFER) {
3167 for (auto &p : in->cap_snaps)
3168 p.second.dirty_data = 0;
3169 signal_cond_list(in->waitfor_commit);
3170 ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
3171 ++put_nref;
3172 }
3173 }
3174 if (last & CEPH_CAP_FILE_CACHE) {
3175 ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
3176 ++put_nref;
3177 }
3178 if (drop)
3179 check_caps(in, 0);
// release the inode pins the first BUFFER/CACHE refs took in get_cap_ref()
3180 if (put_nref)
3181 put_inode(in, put_nref);
3182 }
3183 }
3184
// Block until we hold at least 'need' caps on 'in' (optionally also grabbing
// 'want' if available and not being revoked).  On success *phave is set to the
// caps taken and a ref is held on them.  endoff > max_size triggers a
// wanted_max_size bump and waits for the MDS to extend max_size.
// Returns 0, or -EBADF / -EROFS / error from _renew_caps().
3185 int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
3186 {
3187 int r = check_pool_perm(in, need);
3188 if (r < 0)
3189 return r;
3190
3191 while (1) {
3192 int file_wanted = in->caps_file_wanted();
3193 if ((file_wanted & need) != need) {
3194 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3195 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3196 << dendl;
3197 return -EBADF;
3198 }
3199
3200 int implemented;
3201 int have = in->caps_issued(&implemented);
3202
3203 bool waitfor_caps = false;
3204 bool waitfor_commit = false;
3205
3206 if (have & need & CEPH_CAP_FILE_WR) {
// write would pass max_size (or double the size): ask the MDS for more room
3207 if (endoff > 0 &&
3208 (endoff >= (loff_t)in->max_size ||
3209 endoff > (loff_t)(in->size << 1)) &&
3210 endoff > (loff_t)in->wanted_max_size) {
3211 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3212 in->wanted_max_size = endoff;
3213 check_caps(in, 0);
3214 }
3215
3216 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3217 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3218 waitfor_caps = true;
3219 }
// can't write while a cap_snap is being written or still has dirty data
3220 if (!in->cap_snaps.empty()) {
3221 if (in->cap_snaps.rbegin()->second.writing) {
3222 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3223 waitfor_caps = true;
3224 }
3225 for (auto &p : in->cap_snaps) {
3226 if (p.second.dirty_data) {
3227 waitfor_commit = true;
3228 break;
3229 }
3230 }
3231 if (waitfor_commit) {
3232 _flush(in, new C_Client_FlushComplete(this, in));
3233 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3234 }
3235 }
3236 }
3237
3238 if (!waitfor_caps && !waitfor_commit) {
3239 if ((have & need) == need) {
7c673cae
FG
3240 int revoking = implemented & ~have;
3241 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3242 << " need " << ccap_string(need) << " want " << ccap_string(want)
c07f9fc5 3243 << " revoking " << ccap_string(revoking)
7c673cae 3244 << dendl;
// success: take the needed caps plus any wanted ones not under revocation
c07f9fc5 3245 if ((revoking & want) == 0) {
7c673cae
FG
3246 *phave = need | (have & want);
3247 in->get_cap_ref(need);
3248 return 0;
3249 }
3250 }
3251 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3252 waitfor_caps = true;
3253 }
3254
3255 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3256 in->auth_cap->session->readonly)
3257 return -EROFS;
3258
// caps were dropped (e.g. session reset); re-request them from the MDS
3259 if (in->flags & I_CAP_DROPPED) {
3260 int mds_wanted = in->caps_mds_wanted();
3261 if ((mds_wanted & need) != need) {
3262 int ret = _renew_caps(in);
3263 if (ret < 0)
3264 return ret;
3265 continue;
3266 }
a8e16298 3267 if (!(file_wanted & ~mds_wanted))
7c673cae 3268 in->flags &= ~I_CAP_DROPPED;
7c673cae
FG
3269 }
3270
3271 if (waitfor_caps)
3272 wait_on_list(in->waitfor_caps);
3273 else if (waitfor_commit)
3274 wait_on_list(in->waitfor_commit);
3275 }
3276 }
3277
3278int Client::get_caps_used(Inode *in)
3279{
3280 unsigned used = in->caps_used();
3281 if (!(used & CEPH_CAP_FILE_CACHE) &&
3282 !objectcacher->set_is_empty(&in->oset))
3283 used |= CEPH_CAP_FILE_CACHE;
3284 return used;
3285}
3286
3287void Client::cap_delay_requeue(Inode *in)
3288{
3289 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3290 in->hold_caps_until = ceph_clock_now();
3291 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3292 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3293}
3294
// Send a CEPH_CAP_OP_UPDATE message for one cap to its MDS: report what we
// use/want/flush, drop what we no longer retain, and piggyback current inode
// metadata (size, times, xattrs, inline data).  Updates cap->issued/implemented
// and, for the auth cap, requested_max_size as a side effect.
3295 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3296 bool sync, int used, int want, int retain,
3297 int flush, ceph_tid_t flush_tid)
3298 {
3299 int held = cap->issued | cap->implemented;
3300 int revoking = cap->implemented & ~cap->issued;
// never retain bits the MDS is actively revoking
3301 retain &= ~revoking;
3302 int dropping = cap->issued & ~retain;
3303 int op = CEPH_CAP_OP_UPDATE;
3304
3305 ldout(cct, 10) << "send_cap " << *in
3306 << " mds." << session->mds_num << " seq " << cap->seq
3307 << (sync ? " sync " : " async ")
3308 << " used " << ccap_string(used)
3309 << " want " << ccap_string(want)
3310 << " flush " << ccap_string(flush)
3311 << " retain " << ccap_string(retain)
3312 << " held "<< ccap_string(held)
3313 << " revoking " << ccap_string(revoking)
3314 << " dropping " << ccap_string(dropping)
3315 << dendl;
3316
// Test-only fault injection: pretend we failed to release revoked caps.
3317 if (cct->_conf->client_inject_release_failure && revoking) {
3318 const int would_have_issued = cap->issued & retain;
3319 const int would_have_implemented = cap->implemented & (cap->issued | used);
3320 // Simulated bug:
3321 // - tell the server we think issued is whatever they issued plus whatever we implemented
3322 // - leave what we have implemented in place
3323 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3324 cap->issued = cap->issued | cap->implemented;
3325
3326 // Make an exception for revoking xattr caps: we are injecting
3327 // failure to release other caps, but allow xattr because client
3328 // will block on xattr ops if it can't release these to MDS (#9800)
3329 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3330 cap->issued ^= xattr_mask & revoking;
3331 cap->implemented ^= xattr_mask & revoking;
3332
3333 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3334 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3335 } else {
3336 // Normal behaviour
3337 cap->issued &= retain;
3338 cap->implemented &= cap->issued | used;
3339 }
3340
3341 snapid_t follows = 0;
3342
3343 if (flush)
3344 follows = in->snaprealm->get_snap_context().seq;
3345
3346 MClientCaps *m = new MClientCaps(op,
3347 in->ino,
3348 0,
3349 cap->cap_id, cap->seq,
3350 cap->implemented,
3351 want,
3352 flush,
3353 cap->mseq,
3354 cap_epoch_barrier);
3355 m->caller_uid = in->cap_dirtier_uid;
3356 m->caller_gid = in->cap_dirtier_gid;
3357
3358 m->head.issue_seq = cap->issue_seq;
3359 m->set_tid(flush_tid);
3360
3361 m->head.uid = in->uid;
3362 m->head.gid = in->gid;
3363 m->head.mode = in->mode;
3364
3365 m->head.nlink = in->nlink;
3366
// only ship the xattr blob when we are actually flushing dirty xattrs
3367 if (flush & CEPH_CAP_XATTR_EXCL) {
3368 ::encode(in->xattrs, m->xattrbl);
3369 m->head.xattr_version = in->xattr_version;
3370 }
3371
3372 m->size = in->size;
3373 m->max_size = in->max_size;
3374 m->truncate_seq = in->truncate_seq;
3375 m->truncate_size = in->truncate_size;
3376 m->mtime = in->mtime;
3377 m->atime = in->atime;
3378 m->ctime = in->ctime;
3379 m->btime = in->btime;
3380 m->time_warp_seq = in->time_warp_seq;
3381 m->change_attr = in->change_attr;
3382 if (sync)
3383 m->flags |= CLIENT_CAPS_SYNC;
3384
3385 if (flush & CEPH_CAP_FILE_WR) {
3386 m->inline_version = in->inline_version;
3387 m->inline_data = in->inline_data;
3388 }
3389
3390 in->reported_size = in->size;
3391 m->set_snap_follows(follows);
3392 cap->wanted = want;
// only the auth cap carries our desired max_size
3393 if (cap == in->auth_cap) {
3394 m->set_max_size(in->wanted_max_size);
3395 in->requested_max_size = in->wanted_max_size;
3396 ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
3397 }
3398
3399 if (!session->flushing_caps_tids.empty())
3400 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3401
3402 session->con->send_message(m);
3403 }
3404
31f18b77
FG
3405static bool is_max_size_approaching(Inode *in)
3406{
3407 /* mds will adjust max size according to the reported size */
3408 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3409 return false;
3410 if (in->size >= in->max_size)
3411 return true;
3412 /* half of previous max_size increment has been used */
3413 if (in->max_size > in->reported_size &&
3414 (in->size << 1) >= in->max_size + in->reported_size)
3415 return true;
3416 return false;
3417}
7c673cae
FG
3418
3419/**
3420 * check_caps
3421 *
3422 * Examine currently used and wanted versus held caps. Release, flush or ack
3423 * revoked caps to the MDS as appropriate.
3424 *
3425 * @param in the inode to check
3426 * @param flags flags to apply to cap check
3427 */
3428 void Client::check_caps(Inode *in, unsigned flags)
3429 {
3430 unsigned wanted = in->caps_wanted();
3431 unsigned used = get_caps_used(in);
3432 unsigned cap_used;
3433
7c673cae
FG
3434 int implemented;
3435 int issued = in->caps_issued(&implemented);
3436 int revoking = implemented & ~issued;
3437
// Decide which caps to keep; everything else becomes a candidate to release.
3438 int retain = wanted | used | CEPH_CAP_PIN;
a8e16298
TL
3439 if (!unmounting && in->nlink > 0) {
3440 if (wanted) {
7c673cae 3441 retain |= CEPH_CAP_ANY;
a8e16298
TL
3442 } else if (in->is_dir() &&
3443 (issued & CEPH_CAP_FILE_SHARED) &&
3444 (in->flags & I_COMPLETE)) {
3445 // we do this here because we don't want to drop to Fs (and then
3446 // drop the Fs if we do a create!) if that alone makes us send lookups
3447 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3448 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3449 retain |= wanted;
3450 } else {
7c673cae 3451 retain |= CEPH_CAP_ANY_SHARED;
a8e16298
TL
3452 // keep RD only if we didn't have the file open RW,
3453 // because then the mds would revoke it anyway to
3454 // journal max_size=0.
3455 if (in->max_size == 0)
3456 retain |= CEPH_CAP_ANY_RD;
3457 }
7c673cae
FG
3458 }
3459
3460 ldout(cct, 10) << "check_caps on " << *in
3461 << " wanted " << ccap_string(wanted)
3462 << " used " << ccap_string(used)
3463 << " issued " << ccap_string(issued)
3464 << " revoking " << ccap_string(revoking)
3465 << " flags=" << flags
3466 << dendl;
3467
3468 if (in->snapid != CEPH_NOSNAP)
3469 return; //snap caps last forever, can't write
3470
3471 if (in->caps.empty())
3472 return; // guard if at end of func
3473
// CACHE is being revoked and nothing is dirty: try to drop the object cache now
3474 if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
94b18763
FG
3475 (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) {
3476 if (_release(in))
3477 used &= ~CEPH_CAP_FILE_CACHE;
3478 }
7c673cae
FG
3479
3480 if (!in->cap_snaps.empty())
3481 flush_snaps(in);
3482
3483 if (flags & CHECK_CAPS_NODELAY)
3484 in->hold_caps_until = utime_t();
3485 else
3486 cap_delay_requeue(in);
3487
3488 utime_t now = ceph_clock_now();
3489
// Walk every cap (per-MDS); 'goto ack' sends an update for that cap,
// 'continue' skips it this round.
3490 map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
3491 while (it != in->caps.end()) {
3492 mds_rank_t mds = it->first;
3493 Cap *cap = it->second;
3494 ++it;
3495
3496 MetaSession *session = mds_sessions[mds];
3497 assert(session);
3498
3499 cap_used = used;
3500 if (in->auth_cap && cap != in->auth_cap)
3501 cap_used &= ~in->auth_cap->issued;
3502
3503 revoking = cap->implemented & ~cap->issued;
3504
3505 ldout(cct, 10) << " cap mds." << mds
3506 << " issued " << ccap_string(cap->issued)
3507 << " implemented " << ccap_string(cap->implemented)
3508 << " revoking " << ccap_string(revoking) << dendl;
3509
3510 if (in->wanted_max_size > in->max_size &&
3511 in->wanted_max_size > in->requested_max_size &&
3512 cap == in->auth_cap)
3513 goto ack;
3514
3515 /* approaching file_max? */
3516 if ((cap->issued & CEPH_CAP_FILE_WR) &&
31f18b77
FG
3517 cap == in->auth_cap &&
3518 is_max_size_approaching(in)) {
7c673cae 3519 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
31f18b77 3520 << ", reported " << in->reported_size << dendl;
7c673cae
FG
3521 goto ack;
3522 }
3523
3524 /* completed revocation? */
3525 if (revoking && (revoking & cap_used) == 0) {
3526 ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
3527 goto ack;
3528 }
3529
3530 /* want more caps from mds? */
3531 if (wanted & ~(cap->wanted | cap->issued))
3532 goto ack;
3533
3534 if (!revoking && unmounting && (cap_used == 0))
3535 goto ack;
3536
a8e16298
TL
3537 if ((cap->issued & ~retain) == 0 && // and we don't have anything we wouldn't like
3538 !in->dirty_caps) // and we have no dirty caps
7c673cae
FG
3539 continue;
3540
3541 if (now < in->hold_caps_until) {
3542 ldout(cct, 10) << "delaying cap release" << dendl;
3543 continue;
3544 }
3545
3546 ack:
3547 // re-send old cap/snapcap flushes first.
3548 if (session->mds_state >= MDSMap::STATE_RECONNECT &&
3549 session->mds_state < MDSMap::STATE_ACTIVE &&
3550 session->early_flushing_caps.count(in) == 0) {
3551 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3552 << " to mds." << session->mds_num << dendl;
3553 session->early_flushing_caps.insert(in);
3554 if (in->cap_snaps.size())
3555 flush_snaps(in, true);
3556 if (in->flushing_caps)
3557 flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
3558 }
3559
// dirty caps are only flushed through the auth cap
3560 int flushing;
3561 ceph_tid_t flush_tid;
3562 if (in->auth_cap == cap && in->dirty_caps) {
3563 flushing = mark_caps_flushing(in, &flush_tid);
3564 } else {
3565 flushing = 0;
3566 flush_tid = 0;
3567 }
3568
3569 send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
3570 retain, flushing, flush_tid);
3571 }
3572 }
3573
3574
// Capture the inode's current (dirty) state into a CapSnap keyed by the old
// snap context's seq, so it can be flushed to the MDS for the snapshot.
// No-op if a cap_snap write is already pending or nothing is dirty/in use.
3575 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3576 {
3577 int used = get_caps_used(in);
3578 int dirty = in->caps_dirty();
3579 ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3580
3581 if (in->cap_snaps.size() &&
3582 in->cap_snaps.rbegin()->second.writing) {
3583 ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
3584 return;
3585 } else if (in->caps_dirty() ||
3586 (used & CEPH_CAP_FILE_WR) ||
3587 (dirty & CEPH_CAP_ANY_WR)) {
// construct the CapSnap in place, keyed by the old snap context's seq
3588 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3589 assert(capsnapem.second == true); /* element inserted */
3590 CapSnap &capsnap = capsnapem.first->second;
3591 capsnap.context = old_snapc;
3592 capsnap.issued = in->caps_issued();
3593 capsnap.dirty = in->caps_dirty();
3594
3595 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3596
3597 capsnap.uid = in->uid;
3598 capsnap.gid = in->gid;
3599 capsnap.mode = in->mode;
3600 capsnap.btime = in->btime;
3601 capsnap.xattrs = in->xattrs;
3602 capsnap.xattr_version = in->xattr_version;
3603
// a write is in flight: the cap_snap completes later in put_cap_ref()
3604 if (used & CEPH_CAP_FILE_WR) {
3605 ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
3606 capsnap.writing = 1;
3607 } else {
3608 finish_cap_snap(in, capsnap, used);
3609 }
3610 } else {
3611 ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
3612 }
3613 }
3614
3615void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3616{
3617 ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3618 capsnap.size = in->size;
3619 capsnap.mtime = in->mtime;
3620 capsnap.atime = in->atime;
3621 capsnap.ctime = in->ctime;
3622 capsnap.time_warp_seq = in->time_warp_seq;
3623 capsnap.change_attr = in->change_attr;
3624
3625 capsnap.dirty |= in->caps_dirty();
3626
3627 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3628 capsnap.inline_data = in->inline_data;
3629 capsnap.inline_version = in->inline_version;
3630 }
3631
3632 if (used & CEPH_CAP_FILE_BUFFER) {
3633 ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
3634 << " WRBUFFER, delaying" << dendl;
3635 } else {
3636 capsnap.dirty_data = 0;
3637 flush_snaps(in);
3638 }
3639}
3640
3641void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3642{
3643 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3644 in->cap_snaps.at(seq).dirty_data = 0;
3645 flush_snaps(in);
3646}
3647
// Send FLUSHSNAP messages for every ready cap_snap on 'in' to the auth MDS.
// A snap is skipped while it still has dirty data or an in-flight write.
// With all_again=true, re-sends snaps that already have a flush_tid
// (used when reflushing after reconnect).
3648 void Client::flush_snaps(Inode *in, bool all_again)
3649 {
3650 ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
3651 assert(in->cap_snaps.size());
3652
3653 // pick auth mds
3654 assert(in->auth_cap);
3655 MetaSession *session = in->auth_cap->session;
3656 int mseq = in->auth_cap->mseq;
3657
3658 for (auto &p : in->cap_snaps) {
3659 CapSnap &capsnap = p.second;
3660 if (!all_again) {
3661 // only flush once per session
3662 if (capsnap.flush_tid > 0)
3663 continue;
3664 }
3665
3666 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3667 << " follows " << p.first
3668 << " size " << capsnap.size
3669 << " mtime " << capsnap.mtime
3670 << " dirty_data=" << capsnap.dirty_data
3671 << " writing=" << capsnap.writing
3672 << " on " << *in << dendl;
3673 if (capsnap.dirty_data || capsnap.writing)
3674 continue;
3675
// first flush of this snap: assign a tid and register with the session
3676 if (capsnap.flush_tid == 0) {
3677 capsnap.flush_tid = ++last_flush_tid;
3678 if (!in->flushing_cap_item.is_on_list())
3679 session->flushing_caps.push_back(&in->flushing_cap_item);
3680 session->flushing_caps_tids.insert(capsnap.flush_tid);
3681 }
3682
3683 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
3684 cap_epoch_barrier);
3685 if (user_id >= 0)
3686 m->caller_uid = user_id;
3687 if (group_id >= 0)
3688 m->caller_gid = group_id;
3689
3690 m->set_client_tid(capsnap.flush_tid);
3691 m->head.snap_follows = p.first;
3692
3693 m->head.caps = capsnap.issued;
3694 m->head.dirty = capsnap.dirty;
3695
3696 m->head.uid = capsnap.uid;
3697 m->head.gid = capsnap.gid;
3698 m->head.mode = capsnap.mode;
3699 m->btime = capsnap.btime;
3700
3701 m->size = capsnap.size;
3702
3703 m->head.xattr_version = capsnap.xattr_version;
3704 ::encode(capsnap.xattrs, m->xattrbl);
3705
3706 m->ctime = capsnap.ctime;
3707 m->btime = capsnap.btime;
3708 m->mtime = capsnap.mtime;
3709 m->atime = capsnap.atime;
3710 m->time_warp_seq = capsnap.time_warp_seq;
3711 m->change_attr = capsnap.change_attr;
3712
3713 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3714 m->inline_version = in->inline_version;
3715 m->inline_data = in->inline_data;
3716 }
3717
3718 assert(!session->flushing_caps_tids.empty());
3719 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3720
3721 session->con->send_message(m);
3722 }
3723 }
3724
3725
3726
3727void Client::wait_on_list(list<Cond*>& ls)
3728{
3729 Cond cond;
3730 ls.push_back(&cond);
3731 cond.Wait(client_lock);
3732 ls.remove(&cond);
3733}
3734
3735void Client::signal_cond_list(list<Cond*>& ls)
3736{
3737 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3738 (*it)->Signal();
3739}
3740
3741void Client::wait_on_context_list(list<Context*>& ls)
3742{
3743 Cond cond;
3744 bool done = false;
3745 int r;
3746 ls.push_back(new C_Cond(&cond, &done, &r));
3747 while (!done)
3748 cond.Wait(client_lock);
3749}
3750
3751void Client::signal_context_list(list<Context*>& ls)
3752{
3753 while (!ls.empty()) {
3754 ls.front()->complete(0);
3755 ls.pop_front();
3756 }
3757}
3758
a8e16298 3759void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
7c673cae
FG
3760{
3761 xlist<Cap*>::iterator iter = s->caps.begin();
3762 while (!iter.end()){
a8e16298
TL
3763 auto cap = *iter;
3764 auto in = cap->inode;
7c673cae 3765 ++iter;
a8e16298
TL
3766 if (reconnect) {
3767 in->requested_max_size = 0;
3768 in->wanted_max_size = 0;
3769 } else {
3770 if (cap->gen < s->cap_gen) {
3771 // mds did not re-issue stale cap.
3772 cap->issued = cap->implemented = CEPH_CAP_PIN;
3773 // make sure mds knows what we want.
3774 if (in->caps_file_wanted() & ~cap->wanted)
3775 in->flags |= I_CAP_DROPPED;
3776 }
3777 }
3778 signal_cond_list(in->waitfor_caps);
7c673cae
FG
3779 }
3780}
3781
3782
3783// flush dirty data (from objectcache)
3784
// Queued on the async invalidator thread to run the user's ino_invalidate_cb
// outside client_lock.  The vinodeno is captured at construction time (using
// the faked ino when enabled) so the Inode pointer need not stay valid.
3785 class C_Client_CacheInvalidate : public Context {
3786 private:
3787 Client *client;
3788 vinodeno_t ino;
3789 int64_t offset, length;
3790 public:
3791 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3792 client(c), offset(off), length(len) {
3793 if (client->use_faked_inos())
3794 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3795 else
3796 ino = in->vino();
3797 }
3798 void finish(int r) override {
3799 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3800 assert(!client->client_lock.is_locked_by_me());
3801 client->_async_invalidate(ino, offset, length);
3802 }
3803 };
3804
3805void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3806{
3807 if (unmounting)
3808 return;
3809 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3810 ino_invalidate_cb(callback_handle, ino, off, len);
3811}
3812
3813void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3814
3815 if (ino_invalidate_cb)
3816 // we queue the invalidate, which calls the callback and decrements the ref
3817 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3818}
3819
// Drop the entire userspace object cache for 'in' and queue a kernel/user
// invalidation of the whole inode (offset 0, length 0 == everything).
3820 void Client::_invalidate_inode_cache(Inode *in)
3821 {
3822 ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
3823
3824 // invalidate our userspace inode cache
94b18763 3825 if (cct->_conf->client_oc) {
7c673cae 3826 objectcacher->release_set(&in->oset);
94b18763
FG
// release_set only drops clean data, so anything left behind is an error
3827 if (!objectcacher->set_is_empty(&in->oset))
3828 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3829 }
7c673cae
FG
3830
3831 _schedule_invalidate_callback(in, 0, 0);
3832 }
3833
3834void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3835{
3836 ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
3837
3838 // invalidate our userspace inode cache
3839 if (cct->_conf->client_oc) {
3840 vector<ObjectExtent> ls;
3841 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
28e407b8 3842 objectcacher->discard_writeback(&in->oset, ls, nullptr);
7c673cae
FG
3843 }
3844
3845 _schedule_invalidate_callback(in, off, len);
3846}
3847
3848bool Client::_release(Inode *in)
3849{
3850 ldout(cct, 20) << "_release " << *in << dendl;
3851 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3852 _invalidate_inode_cache(in);
3853 return true;
3854 }
3855 return false;
3856}
3857
// Flush dirty buffered data for 'in' through the object cacher.
// Returns true if everything was already clean/handled (onfinish completed
// inline), false if the flush is in progress and onfinish will fire later.
// If the pool is full, dirty data is purged and onfinish gets -ENOSPC.
3858 bool Client::_flush(Inode *in, Context *onfinish)
3859 {
3860 ldout(cct, 10) << "_flush " << *in << dendl;
3861
3862 if (!in->oset.dirty_or_tx) {
3863 ldout(cct, 10) << " nothing to flush" << dendl;
3864 onfinish->complete(0);
3865 return true;
3866 }
3867
// can't write to a full pool: throw the dirty data away rather than block
3868 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 3869 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
3870 objectcacher->purge_set(&in->oset);
3871 if (onfinish) {
3872 onfinish->complete(-ENOSPC);
3873 }
3874 return true;
3875 }
3876
3877 return objectcacher->flush_set(&in->oset, onfinish);
3878 }
3879
// Synchronously flush the byte range [offset, offset+size) of 'in'.
// Enters with client_lock held; if the flush is asynchronous, client_lock is
// dropped while waiting on a private mutex/cond and re-taken before return.
3880 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
3881 {
3882 assert(client_lock.is_locked());
3883 if (!in->oset.dirty_or_tx) {
3884 ldout(cct, 10) << " nothing to flush" << dendl;
3885 return;
3886 }
3887
// C_SafeCond sets 'safe' and signals 'cond' under 'flock' when flush completes
3888 Mutex flock("Client::_flush_range flock");
3889 Cond cond;
3890 bool safe = false;
3891 Context *onflush = new C_SafeCond(&flock, &cond, &safe);
3892 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
3893 offset, size, onflush);
3894 if (!ret) {
3895 // wait for flush
3896 client_lock.Unlock();
3897 flock.Lock();
3898 while (!safe)
3899 cond.Wait(flock);
3900 flock.Unlock();
3901 client_lock.Lock();
3902 }
3903 }
3904
3905void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3906{
3907 // Mutex::Locker l(client_lock);
3908 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3909 Inode *in = static_cast<Inode *>(oset->parent);
3910 assert(in);
3911 _flushed(in);
3912}
3913
3914void Client::_flushed(Inode *in)
3915{
3916 ldout(cct, 10) << "_flushed " << *in << dendl;
3917
3918 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
3919}
3920
3921
3922
3923// checks common to add_update_cap, handle_cap_grant
3924void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3925{
3926 unsigned had = in->caps_issued();
3927
3928 if ((issued & CEPH_CAP_FILE_CACHE) &&
3929 !(had & CEPH_CAP_FILE_CACHE))
3930 in->cache_gen++;
3931
3932 if ((issued & CEPH_CAP_FILE_SHARED) &&
3933 !(had & CEPH_CAP_FILE_SHARED)) {
3934 in->shared_gen++;
3935
3936 if (in->is_dir())
3937 clear_dir_complete_and_ordered(in, true);
3938 }
3939}
3940
// Install or refresh the cap for 'in' granted by mds_session.  Handles
// first-cap snaprealm setup, stale-gen reset, auth-cap migration, and wakes
// waiters when new bits were granted.
3941 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
a8e16298
TL
3942 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
3943 inodeno_t realm, int flags, const UserPerm& cap_perms)
7c673cae
FG
3944 {
3945 Cap *cap = 0;
3946 mds_rank_t mds = mds_session->mds_num;
a8e16298
TL
3947 auto it = in->caps.find(mds);
3948 if (it != in->caps.end()) {
3949 cap = it->second;
// cap predates the session's current generation: treat it as bare PIN
3950 if (cap->gen < mds_session->cap_gen)
3951 cap->issued = cap->implemented = CEPH_CAP_PIN;
7c673cae
FG
3952
3953 /*
3954 * auth mds of the inode changed. we received the cap export
3955 * message, but still haven't received the cap import message.
3956 * handle_cap_export() updated the new auth MDS' cap.
3957 *
3958 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
3959 * a message that was send before the cap import message. So
3960 * don't remove caps.
3961 */
3962 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
3963 assert(cap == in->auth_cap);
3964 assert(cap->cap_id == cap_id);
3965 seq = cap->seq;
3966 mseq = cap->mseq;
3967 issued |= cap->issued;
3968 flags |= CEPH_CAP_FLAG_AUTH;
3969 }
3970 } else {
// first cap from this MDS; first cap overall also opens the snaprealm
3971 mds_session->num_caps++;
3972 if (!in->is_any_caps()) {
3973 assert(in->snaprealm == 0);
3974 in->snaprealm = get_snap_realm(realm);
3975 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
3976 ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
3977 }
3978 in->caps[mds] = cap = new Cap;
3979
3980 mds_session->caps.push_back(&cap->cap_item);
3981 cap->session = mds_session;
3982 cap->inode = in;
3983 cap->gen = mds_session->cap_gen;
7c673cae
FG
3984 }
3985
3986 check_cap_issue(in, cap, issued);
3987
// migrate auth cap to this MDS if it has a newer migration seq
3988 if (flags & CEPH_CAP_FLAG_AUTH) {
3989 if (in->auth_cap != cap &&
3990 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
3991 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
3992 ldout(cct, 10) << "add_update_cap changing auth cap: "
3993 << "add myself to new auth MDS' flushing caps list" << dendl;
3994 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
3995 }
3996 in->auth_cap = cap;
3997 }
3998 }
3999
4000 unsigned old_caps = cap->issued;
4001 cap->cap_id = cap_id;
91327a77 4002 cap->issued = issued;
7c673cae 4003 cap->implemented |= issued;
a8e16298
TL
4004 if (ceph_seq_cmp(mseq, cap->mseq) > 0)
4005 cap->wanted = wanted;
4006 else
4007 cap->wanted |= wanted;
7c673cae
FG
4008 cap->seq = seq;
4009 cap->issue_seq = seq;
4010 cap->mseq = mseq;
28e407b8 4011 cap->gen = mds_session->cap_gen;
7c673cae
FG
4012 cap->latest_perms = cap_perms;
4013 ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
a8e16298 4014 << " from mds." << mds << " on " << *in << dendl;
7c673cae
FG
4015
4016 if ((issued & ~old_caps) && in->auth_cap == cap) {
4017 // non-auth MDS is revoking the newly grant caps ?
4018 for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
4019 if (it->second == cap)
4020 continue;
4021 if (it->second->implemented & ~it->second->issued & issued) {
4022 check_caps(in, CHECK_CAPS_NODELAY);
4023 break;
4024 }
4025 }
4026 }
4027
// new bits granted: unblock anyone waiting in get_caps()
4028 if (issued & ~old_caps)
4029 signal_cond_list(in->waitfor_caps);
4030 }
4031
// Tear down one cap: optionally queue a release message to its MDS, detach it
// from the inode/session bookkeeping, and close the snaprealm if this was the
// inode's last cap.
4032 void Client::remove_cap(Cap *cap, bool queue_release)
4033 {
4034 Inode *in = cap->inode;
4035 MetaSession *session = cap->session;
4036 mds_rank_t mds = cap->session->mds_num;
4037
4038 ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;
4039
4040 if (queue_release) {
4041 session->enqueue_cap_release(
4042 in->ino,
4043 cap->cap_id,
4044 cap->issue_seq,
4045 cap->mseq,
4046 cap_epoch_barrier);
4047 }
4048
// losing the auth cap means we can no longer be flushing through it
4049 if (in->auth_cap == cap) {
4050 if (in->flushing_cap_item.is_on_list()) {
4051 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4052 in->flushing_cap_item.remove_myself();
4053 }
4054 in->auth_cap = NULL;
4055 }
4056 assert(in->caps.count(mds));
4057 in->caps.erase(mds);
4058
4059 cap->cap_item.remove_myself();
4060 delete cap;
4061 cap = nullptr;
4062
4063 if (!in->is_any_caps()) {
4064 ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
4065 in->snaprealm_item.remove_myself();
4066 put_snap_realm(in->snaprealm);
4067 in->snaprealm = 0;
4068 }
4069 }
4070
4071void Client::remove_all_caps(Inode *in)
4072{
4073 while (!in->caps.empty())
4074 remove_cap(in->caps.begin()->second, true);
4075}
4076
// Forcibly drop every cap belonging to session 's' (session torn down).
// Dirty/flushing state is discarded with a loud error, cap_snaps are dropped,
// and waiters are woken so they can re-evaluate.
4077 void Client::remove_session_caps(MetaSession *s)
4078 {
4079 ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;
4080
4081 while (s->caps.size()) {
4082 Cap *cap = *s->caps.begin();
4083 Inode *in = cap->inode;
4084 bool dirty_caps = false, cap_snaps = false;
4085 if (in->auth_cap == cap) {
4086 cap_snaps = !in->cap_snaps.empty();
4087 dirty_caps = in->dirty_caps | in->flushing_caps;
4088 in->wanted_max_size = 0;
4089 in->requested_max_size = 0;
7c673cae 4090 }
a8e16298
TL
// remember that we held caps so they get re-requested after reconnect
4091 if (cap->wanted | cap->issued)
4092 in->flags |= I_CAP_DROPPED;
7c673cae 4093 remove_cap(cap, false);
7c673cae
FG
// tmp_ref keeps the inode alive while clearing its snaps drops references
4094 if (cap_snaps) {
4095 InodeRef tmp_ref(in);
4096 in->cap_snaps.clear();
4097 }
4098 if (dirty_caps) {
4099 lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
4100 if (in->flushing_caps) {
4101 num_flushing_caps--;
4102 in->flushing_cap_tids.clear();
4103 }
4104 in->flushing_caps = 0;
28e407b8 4105 in->mark_caps_clean();
7c673cae
FG
4106 put_inode(in);
4107 }
a8e16298 4108 signal_cond_list(in->waitfor_caps);
7c673cae
FG
4109 }
4110 s->flushing_caps_tids.clear();
4111 sync_cond.Signal();
4112 }
4113
91327a77 4114int Client::_do_remount(bool retry_on_error)
b32b8144 4115{
91327a77
AA
4116 uint64_t max_retries = cct->_conf->get_val<uint64_t>("mds_max_retries_on_remount_failure");
4117
b32b8144
FG
4118 errno = 0;
4119 int r = remount_cb(callback_handle);
91327a77
AA
4120 if (r == 0) {
4121 retries_on_invalidate = 0;
4122 } else {
b32b8144
FG
4123 int e = errno;
4124 client_t whoami = get_nodeid();
4125 if (r == -1) {
4126 lderr(cct) <<
4127 "failed to remount (to trim kernel dentries): "
4128 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4129 } else {
4130 lderr(cct) <<
4131 "failed to remount (to trim kernel dentries): "
4132 "return code = " << r << dendl;
4133 }
91327a77
AA
4134 bool should_abort =
4135 (cct->_conf->get_val<bool>("client_die_on_failed_remount") ||
4136 cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4137 !(retry_on_error && (++retries_on_invalidate < max_retries));
b32b8144
FG
4138 if (should_abort && !unmounting) {
4139 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4140 ceph_abort();
4141 }
4142 }
4143 return r;
4144}
4145
7c673cae
FG
4146class C_Client_Remount : public Context {
4147private:
4148 Client *client;
4149public:
4150 explicit C_Client_Remount(Client *c) : client(c) {}
4151 void finish(int r) override {
b32b8144 4152 assert(r == 0);
91327a77 4153 client->_do_remount(true);
7c673cae
FG
4154 }
4155};
4156
4157void Client::_invalidate_kernel_dcache()
4158{
4159 if (unmounting)
4160 return;
94b18763
FG
4161 if (can_invalidate_dentries) {
4162 if (dentry_invalidate_cb && root->dir) {
4163 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4164 p != root->dir->dentries.end();
4165 ++p) {
4166 if (p->second->inode)
4167 _schedule_invalidate_dentry_callback(p->second, false);
4168 }
7c673cae
FG
4169 }
4170 } else if (remount_cb) {
4171 // Hacky:
4172 // when remounting a file system, linux kernel trims all unused dentries in the fs
4173 remount_finisher.queue(new C_Client_Remount(this));
4174 }
4175}
4176
91327a77
AA
4177void Client::_trim_negative_child_dentries(InodeRef& in)
4178{
4179 if (!in->is_dir())
4180 return;
4181
4182 Dir* dir = in->dir;
4183 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4184 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4185 Dentry *dn = p->second;
4186 ++p;
4187 assert(!dn->inode);
4188 if (dn->lru_is_expireable())
4189 unlink(dn, true, false); // keep dir, drop dentry
4190 }
4191 if (dir->dentries.empty()) {
4192 close_dir(dir);
4193 }
4194 }
4195
4196 if (in->flags & I_SNAPDIR_OPEN) {
4197 InodeRef snapdir = open_snapdir(in.get());
4198 _trim_negative_child_dentries(snapdir);
4199 }
4200}
4201
28e407b8 4202void Client::trim_caps(MetaSession *s, uint64_t max)
7c673cae
FG
4203{
4204 mds_rank_t mds = s->mds_num;
28e407b8 4205 size_t caps_size = s->caps.size();
7c673cae
FG
4206 ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
4207 << " caps " << caps_size << dendl;
4208
28e407b8
AA
4209 uint64_t trimmed = 0;
4210 auto p = s->caps.begin();
4211 std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
4212 * looking at from getting deleted during traversal. */
7c673cae
FG
4213 while ((caps_size - trimmed) > max && !p.end()) {
4214 Cap *cap = *p;
b32b8144 4215 InodeRef in(cap->inode);
7c673cae
FG
4216
4217 // Increment p early because it will be invalidated if cap
4218 // is deleted inside remove_cap
4219 ++p;
4220
4221 if (in->caps.size() > 1 && cap != in->auth_cap) {
4222 int mine = cap->issued | cap->implemented;
4223 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4224 // disposable non-auth cap
b32b8144 4225 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
7c673cae 4226 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
28e407b8 4227 cap = (remove_cap(cap, true), nullptr);
7c673cae
FG
4228 trimmed++;
4229 }
4230 } else {
4231 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
91327a77 4232 _trim_negative_child_dentries(in);
7c673cae
FG
4233 bool all = true;
4234 set<Dentry*>::iterator q = in->dn_set.begin();
7c673cae
FG
4235 while (q != in->dn_set.end()) {
4236 Dentry *dn = *q++;
4237 if (dn->lru_is_expireable()) {
4238 if (can_invalidate_dentries &&
4239 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4240 // Only issue one of these per DN for inodes in root: handle
4241 // others more efficiently by calling for root-child DNs at
4242 // the end of this function.
4243 _schedule_invalidate_dentry_callback(dn, true);
4244 }
28e407b8
AA
4245 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4246 to_trim.insert(dn);
7c673cae
FG
4247 } else {
4248 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4249 all = false;
4250 }
4251 }
4252 if (all && in->ino != MDS_INO_ROOT) {
4253 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4254 trimmed++;
4255 }
4256 }
4257 }
28e407b8
AA
4258 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4259 for (const auto &dn : to_trim) {
4260 trim_dentry(dn);
4261 }
4262 to_trim.clear();
7c673cae 4263
b32b8144
FG
4264 caps_size = s->caps.size();
4265 if (caps_size > max)
7c673cae
FG
4266 _invalidate_kernel_dcache();
4267}
4268
4269void Client::force_session_readonly(MetaSession *s)
4270{
4271 s->readonly = true;
4272 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4273 Inode *in = (*p)->inode;
4274 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4275 signal_cond_list(in->waitfor_caps);
4276 }
4277}
4278
7c673cae
FG
4279int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4280{
4281 MetaSession *session = in->auth_cap->session;
4282
4283 int flushing = in->dirty_caps;
4284 assert(flushing);
4285
4286 ceph_tid_t flush_tid = ++last_flush_tid;
4287 in->flushing_cap_tids[flush_tid] = flushing;
4288
4289 if (!in->flushing_caps) {
4290 ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
4291 num_flushing_caps++;
4292 } else {
4293 ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
4294 }
4295
4296 in->flushing_caps |= flushing;
28e407b8 4297 in->mark_caps_clean();
7c673cae
FG
4298
4299 if (!in->flushing_cap_item.is_on_list())
4300 session->flushing_caps.push_back(&in->flushing_cap_item);
4301 session->flushing_caps_tids.insert(flush_tid);
4302
4303 *ptid = flush_tid;
4304 return flushing;
4305}
4306
4307void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4308{
4309 for (auto &p : in->cap_snaps) {
4310 CapSnap &capsnap = p.second;
4311 if (capsnap.flush_tid > 0) {
4312 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4313 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4314 }
4315 }
4316 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4317 it != in->flushing_cap_tids.end();
4318 ++it) {
4319 old_s->flushing_caps_tids.erase(it->first);
4320 new_s->flushing_caps_tids.insert(it->first);
4321 }
4322 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4323}
4324
4325/*
4326 * Flush all caps back to the MDS. Because the callers generally wait on the
4327 * result of this function (syncfs and umount cases), we set
4328 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4329 */
4330void Client::flush_caps_sync()
4331{
4332 ldout(cct, 10) << __func__ << dendl;
28e407b8 4333 xlist<Inode*>::iterator p = delayed_list.begin();
7c673cae
FG
4334 while (!p.end()) {
4335 unsigned flags = CHECK_CAPS_NODELAY;
4336 Inode *in = *p;
4337
4338 ++p;
28e407b8
AA
4339 delayed_list.pop_front();
4340 if (p.end() && dirty_list.empty())
7c673cae
FG
4341 flags |= CHECK_CAPS_SYNCHRONOUS;
4342 check_caps(in, flags);
4343 }
4344
4345 // other caps, too
28e407b8 4346 p = dirty_list.begin();
7c673cae
FG
4347 while (!p.end()) {
4348 unsigned flags = CHECK_CAPS_NODELAY;
4349 Inode *in = *p;
4350
4351 ++p;
4352 if (p.end())
4353 flags |= CHECK_CAPS_SYNCHRONOUS;
4354 check_caps(in, flags);
4355 }
4356}
4357
4358void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
4359{
4360 ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
4361 Cap *cap = in->auth_cap;
4362 assert(cap->session == session);
4363
4364 for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
4365 p != in->flushing_cap_tids.end();
4366 ++p) {
4367 bool req_sync = false;
4368
4369 /* If this is a synchronous request, then flush the journal on last one */
4370 if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
4371 req_sync = true;
4372
4373 send_cap(in, session, cap, req_sync,
4374 (get_caps_used(in) | in->caps_dirty()),
4375 in->caps_wanted(), (cap->issued | cap->implemented),
4376 p->second, p->first);
4377 }
4378}
4379
4380void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4381{
4382 while (in->flushing_caps) {
4383 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4384 assert(it != in->flushing_cap_tids.end());
4385 if (it->first > want)
4386 break;
4387 ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
4388 << ccap_string(it->second) << " want " << want
4389 << " last " << it->first << dendl;
4390 wait_on_list(in->waitfor_caps);
4391 }
4392}
4393
4394void Client::wait_sync_caps(ceph_tid_t want)
4395{
4396 retry:
4397 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4398 << num_flushing_caps << " total flushing)" << dendl;
4399 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4400 p != mds_sessions.end();
4401 ++p) {
4402 MetaSession *s = p->second;
4403 if (s->flushing_caps_tids.empty())
4404 continue;
4405 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4406 if (oldest_tid <= want) {
4407 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4408 << " (want " << want << ")" << dendl;
4409 sync_cond.Wait(client_lock);
4410 goto retry;
4411 }
4412 }
4413}
4414
4415void Client::kick_flushing_caps(MetaSession *session)
4416{
4417 mds_rank_t mds = session->mds_num;
4418 ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;
4419
4420 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4421 Inode *in = *p;
4422 if (session->early_flushing_caps.count(in))
4423 continue;
4424 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4425 if (in->cap_snaps.size())
4426 flush_snaps(in, true);
4427 if (in->flushing_caps)
4428 flush_caps(in, session);
4429 }
4430
4431 session->early_flushing_caps.clear();
4432}
4433
4434void Client::early_kick_flushing_caps(MetaSession *session)
4435{
4436 session->early_flushing_caps.clear();
4437
4438 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4439 Inode *in = *p;
4440 assert(in->auth_cap);
4441
4442 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4443 // stage. This guarantees that MDS processes the cap flush message before issuing
4444 // the flushing caps to other client.
4445 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
4446 continue;
4447
4448 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4449 << " to mds." << session->mds_num << dendl;
4450
4451 session->early_flushing_caps.insert(in);
4452
4453 if (in->cap_snaps.size())
4454 flush_snaps(in, true);
4455 if (in->flushing_caps)
4456 flush_caps(in, session);
4457
4458 }
4459}
4460
7c673cae
FG
4461void SnapRealm::build_snap_context()
4462{
4463 set<snapid_t> snaps;
4464 snapid_t max_seq = seq;
4465
4466 // start with prior_parents?
4467 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4468 snaps.insert(prior_parent_snaps[i]);
4469
4470 // current parent's snaps
4471 if (pparent) {
4472 const SnapContext& psnapc = pparent->get_snap_context();
4473 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4474 if (psnapc.snaps[i] >= parent_since)
4475 snaps.insert(psnapc.snaps[i]);
4476 if (psnapc.seq > max_seq)
4477 max_seq = psnapc.seq;
4478 }
4479
4480 // my snaps
4481 for (unsigned i=0; i<my_snaps.size(); i++)
4482 snaps.insert(my_snaps[i]);
4483
4484 // ok!
4485 cached_snap_context.seq = max_seq;
4486 cached_snap_context.snaps.resize(0);
4487 cached_snap_context.snaps.reserve(snaps.size());
4488 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4489 cached_snap_context.snaps.push_back(*p);
4490}
4491
4492void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4493{
4494 list<SnapRealm*> q;
4495 q.push_back(realm);
4496
4497 while (!q.empty()) {
4498 realm = q.front();
4499 q.pop_front();
4500
4501 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4502 realm->invalidate_cache();
4503
4504 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4505 p != realm->pchildren.end();
4506 ++p)
4507 q.push_back(*p);
4508 }
4509}
4510
4511SnapRealm *Client::get_snap_realm(inodeno_t r)
4512{
4513 SnapRealm *realm = snap_realms[r];
4514 if (!realm)
4515 snap_realms[r] = realm = new SnapRealm(r);
4516 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4517 realm->nref++;
4518 return realm;
4519}
4520
4521SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4522{
4523 if (snap_realms.count(r) == 0) {
4524 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4525 return NULL;
4526 }
4527 SnapRealm *realm = snap_realms[r];
4528 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4529 realm->nref++;
4530 return realm;
4531}
4532
4533void Client::put_snap_realm(SnapRealm *realm)
4534{
4535 ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
4536 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4537 if (--realm->nref == 0) {
4538 snap_realms.erase(realm->ino);
4539 if (realm->pparent) {
4540 realm->pparent->pchildren.erase(realm);
4541 put_snap_realm(realm->pparent);
4542 }
4543 delete realm;
4544 }
4545}
4546
4547bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4548{
4549 if (realm->parent != parent) {
4550 ldout(cct, 10) << "adjust_realm_parent " << *realm
4551 << " " << realm->parent << " -> " << parent << dendl;
4552 realm->parent = parent;
4553 if (realm->pparent) {
4554 realm->pparent->pchildren.erase(realm);
4555 put_snap_realm(realm->pparent);
4556 }
4557 realm->pparent = get_snap_realm(parent);
4558 realm->pparent->pchildren.insert(realm);
4559 return true;
4560 }
4561 return false;
4562}
4563
4564static bool has_new_snaps(const SnapContext& old_snapc,
4565 const SnapContext& new_snapc)
4566{
4567 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4568}
4569
4570
4571void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
4572{
4573 SnapRealm *first_realm = NULL;
4574 ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
4575
4576 map<SnapRealm*, SnapContext> dirty_realms;
4577
4578 bufferlist::iterator p = bl.begin();
4579 while (!p.end()) {
4580 SnapRealmInfo info;
4581 ::decode(info, p);
4582 SnapRealm *realm = get_snap_realm(info.ino());
4583
4584 bool invalidate = false;
4585
4586 if (info.seq() > realm->seq) {
4587 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
4588 << dendl;
4589
4590 if (flush) {
4591 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4592 // flush me + children
4593 list<SnapRealm*> q;
4594 q.push_back(realm);
4595 while (!q.empty()) {
4596 SnapRealm *realm = q.front();
4597 q.pop_front();
4598
4599 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4600 p != realm->pchildren.end();
4601 ++p)
4602 q.push_back(*p);
4603
4604 if (dirty_realms.count(realm) == 0) {
4605 realm->nref++;
4606 dirty_realms[realm] = realm->get_snap_context();
4607 }
4608 }
4609 }
4610
4611 // update
4612 realm->seq = info.seq();
4613 realm->created = info.created();
4614 realm->parent_since = info.parent_since();
4615 realm->prior_parent_snaps = info.prior_parent_snaps;
4616 realm->my_snaps = info.my_snaps;
4617 invalidate = true;
4618 }
4619
4620 // _always_ verify parent
4621 if (adjust_realm_parent(realm, info.parent()))
4622 invalidate = true;
4623
4624 if (invalidate) {
4625 invalidate_snaprealm_and_children(realm);
4626 ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
4627 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4628 } else {
4629 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
4630 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4631 }
4632
4633 if (!first_realm)
4634 first_realm = realm;
4635 else
4636 put_snap_realm(realm);
4637 }
4638
4639 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4640 q != dirty_realms.end();
4641 ++q) {
4642 SnapRealm *realm = q->first;
4643 // if there are new snaps ?
4644 if (has_new_snaps(q->second, realm->get_snap_context())) {
4645 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4646 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4647 while (!r.end()) {
4648 Inode *in = *r;
4649 ++r;
4650 queue_cap_snap(in, q->second);
4651 }
4652 } else {
4653 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4654 }
4655 put_snap_realm(realm);
4656 }
4657
4658 if (realm_ret)
4659 *realm_ret = first_realm;
4660 else
4661 put_snap_realm(first_realm);
4662}
4663
4664void Client::handle_snap(MClientSnap *m)
4665{
4666 ldout(cct, 10) << "handle_snap " << *m << dendl;
4667 mds_rank_t mds = mds_rank_t(m->get_source().num());
4668 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4669 if (!session) {
4670 m->put();
4671 return;
4672 }
4673
4674 got_mds_push(session);
4675
4676 map<Inode*, SnapContext> to_move;
4677 SnapRealm *realm = 0;
4678
4679 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4680 assert(m->head.split);
4681 SnapRealmInfo info;
4682 bufferlist::iterator p = m->bl.begin();
4683 ::decode(info, p);
4684 assert(info.ino() == m->head.split);
4685
4686 // flush, then move, ino's.
4687 realm = get_snap_realm(info.ino());
4688 ldout(cct, 10) << " splitting off " << *realm << dendl;
4689 for (vector<inodeno_t>::iterator p = m->split_inos.begin();
4690 p != m->split_inos.end();
4691 ++p) {
4692 vinodeno_t vino(*p, CEPH_NOSNAP);
4693 if (inode_map.count(vino)) {
4694 Inode *in = inode_map[vino];
4695 if (!in->snaprealm || in->snaprealm == realm)
4696 continue;
4697 if (in->snaprealm->created > info.created()) {
4698 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4699 << *in->snaprealm << dendl;
4700 continue;
4701 }
4702 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4703
4704
4705 in->snaprealm_item.remove_myself();
4706 to_move[in] = in->snaprealm->get_snap_context();
4707 put_snap_realm(in->snaprealm);
4708 }
4709 }
4710
4711 // move child snaprealms, too
4712 for (vector<inodeno_t>::iterator p = m->split_realms.begin();
4713 p != m->split_realms.end();
4714 ++p) {
4715 ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
4716 SnapRealm *child = get_snap_realm_maybe(*p);
4717 if (!child)
4718 continue;
4719 adjust_realm_parent(child, realm->ino);
4720 put_snap_realm(child);
4721 }
4722 }
4723
4724 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
4725
4726 if (realm) {
4727 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
4728 Inode *in = p->first;
4729 in->snaprealm = realm;
4730 realm->inodes_with_caps.push_back(&in->snaprealm_item);
4731 realm->nref++;
4732 // queue for snap writeback
4733 if (has_new_snaps(p->second, realm->get_snap_context()))
4734 queue_cap_snap(in, p->second);
4735 }
4736 put_snap_realm(realm);
4737 }
4738
4739 m->put();
4740}
4741
4742void Client::handle_quota(MClientQuota *m)
4743{
4744 mds_rank_t mds = mds_rank_t(m->get_source().num());
4745 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4746 if (!session) {
4747 m->put();
4748 return;
4749 }
4750
4751 got_mds_push(session);
4752
4753 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4754
4755 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4756 if (inode_map.count(vino)) {
4757 Inode *in = NULL;
4758 in = inode_map[vino];
4759
4760 if (in) {
4761 in->quota = m->quota;
4762 in->rstat = m->rstat;
4763 }
4764 }
4765
4766 m->put();
4767}
4768
4769void Client::handle_caps(MClientCaps *m)
4770{
4771 mds_rank_t mds = mds_rank_t(m->get_source().num());
4772 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4773 if (!session) {
4774 m->put();
4775 return;
4776 }
4777
4778 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4779 // Pause RADOS operations until we see the required epoch
4780 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4781 }
4782
4783 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4784 // Record the barrier so that we will transmit it to MDS when releasing
4785 set_cap_epoch_barrier(m->osd_epoch_barrier);
4786 }
4787
4788 got_mds_push(session);
4789
4790 m->clear_payload(); // for if/when we send back to MDS
4791
4792 Inode *in = 0;
4793 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4794 if (inode_map.count(vino))
4795 in = inode_map[vino];
4796 if (!in) {
4797 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4798 ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4799 session->enqueue_cap_release(
4800 m->get_ino(),
4801 m->get_cap_id(),
4802 m->get_seq(),
4803 m->get_mseq(),
4804 cap_epoch_barrier);
4805 } else {
4806 ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
4807 }
4808 m->put();
4809
4810 // in case the mds is waiting on e.g. a revocation
4811 flush_cap_releases();
4812 return;
4813 }
4814
4815 switch (m->get_op()) {
4816 case CEPH_CAP_OP_EXPORT:
4817 return handle_cap_export(session, in, m);
4818 case CEPH_CAP_OP_FLUSHSNAP_ACK:
4819 return handle_cap_flushsnap_ack(session, in, m);
4820 case CEPH_CAP_OP_IMPORT:
4821 handle_cap_import(session, in, m);
4822 }
4823
4824 if (in->caps.count(mds) == 0) {
4825 ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
4826 m->put();
4827 return;
4828 }
4829
4830 Cap *cap = in->caps[mds];
4831
4832 switch (m->get_op()) {
4833 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4834 case CEPH_CAP_OP_IMPORT:
4835 case CEPH_CAP_OP_REVOKE:
4836 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
4837 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
4838 default:
4839 m->put();
4840 }
4841}
4842
4843void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
4844{
4845 mds_rank_t mds = session->mds_num;
4846
4847 ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
4848 << " IMPORT from mds." << mds << dendl;
4849
4850 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4851 Cap *cap = NULL;
4852 UserPerm cap_perms;
4853 if (m->peer.cap_id && in->caps.count(peer_mds)) {
4854 cap = in->caps[peer_mds];
4855 if (cap) {
4856 cap_perms = cap->latest_perms;
4857 }
4858 }
4859
4860 // add/update it
4861 SnapRealm *realm = NULL;
4862 update_snap_trace(m->snapbl, &realm);
4863
4864 add_update_cap(in, session, m->get_cap_id(),
a8e16298
TL
4865 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
4866 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
7c673cae
FG
4867
4868 if (cap && cap->cap_id == m->peer.cap_id) {
4869 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
4870 }
4871
4872 if (realm)
4873 put_snap_realm(realm);
4874
4875 if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
4876 // reflush any/all caps (if we are now the auth_cap)
4877 if (in->cap_snaps.size())
4878 flush_snaps(in, true);
4879 if (in->flushing_caps)
4880 flush_caps(in, session);
4881 }
4882}
4883
4884void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
4885{
4886 mds_rank_t mds = session->mds_num;
4887
4888 ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
4889 << " EXPORT from mds." << mds << dendl;
4890
4891 Cap *cap = NULL;
4892 if (in->caps.count(mds))
4893 cap = in->caps[mds];
4894
7c673cae
FG
4895 if (cap && cap->cap_id == m->get_cap_id()) {
4896 if (m->peer.cap_id) {
a8e16298 4897 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
7c673cae
FG
4898 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
4899 if (in->caps.count(peer_mds)) {
4900 Cap *tcap = in->caps[peer_mds];
181888fb 4901 if (tcap->cap_id == m->peer.cap_id &&
7c673cae
FG
4902 ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
4903 tcap->cap_id = m->peer.cap_id;
4904 tcap->seq = m->peer.seq - 1;
4905 tcap->issue_seq = tcap->seq;
7c673cae
FG
4906 tcap->issued |= cap->issued;
4907 tcap->implemented |= cap->issued;
4908 if (cap == in->auth_cap)
4909 in->auth_cap = tcap;
4910 if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
4911 adjust_session_flushing_caps(in, session, tsession);
4912 }
4913 } else {
a8e16298 4914 add_update_cap(in, tsession, m->peer.cap_id, cap->issued, 0,
7c673cae
FG
4915 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
4916 cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
4917 cap->latest_perms);
4918 }
4919 } else {
a8e16298 4920 if (cap->wanted | cap->issued)
7c673cae
FG
4921 in->flags |= I_CAP_DROPPED;
4922 }
4923
4924 remove_cap(cap, false);
4925 }
4926
4927 m->put();
4928}
4929
4930void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
4931{
4932 mds_rank_t mds = session->mds_num;
4933 assert(in->caps[mds]);
4934
4935 ldout(cct, 10) << "handle_cap_trunc on ino " << *in
4936 << " size " << in->size << " -> " << m->get_size()
4937 << dendl;
4938
1adf2230
AA
4939 int issued;
4940 in->caps_issued(&issued);
4941 issued |= in->caps_dirty();
4942 update_inode_file_size(in, issued, m->get_size(),
4943 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
4944 m->put();
4945}
4946
4947void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
4948{
4949 ceph_tid_t flush_ack_tid = m->get_client_tid();
4950 int dirty = m->get_dirty();
4951 int cleaned = 0;
4952 int flushed = 0;
4953
4954 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4955 it != in->flushing_cap_tids.end(); ) {
4956 if (it->first == flush_ack_tid)
4957 cleaned = it->second;
4958 if (it->first <= flush_ack_tid) {
4959 session->flushing_caps_tids.erase(it->first);
4960 in->flushing_cap_tids.erase(it++);
4961 ++flushed;
4962 continue;
4963 }
4964 cleaned &= ~it->second;
4965 if (!cleaned)
4966 break;
4967 ++it;
4968 }
4969
4970 ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
4971 << " cleaned " << ccap_string(cleaned) << " on " << *in
4972 << " with " << ccap_string(dirty) << dendl;
4973
4974 if (flushed) {
4975 signal_cond_list(in->waitfor_caps);
4976 if (session->flushing_caps_tids.empty() ||
4977 *session->flushing_caps_tids.begin() > flush_ack_tid)
4978 sync_cond.Signal();
4979 }
4980
4981 if (!dirty) {
4982 in->cap_dirtier_uid = -1;
4983 in->cap_dirtier_gid = -1;
4984 }
4985
4986 if (!cleaned) {
4987 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
4988 } else {
4989 if (in->flushing_caps) {
4990 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
4991 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
4992 in->flushing_caps &= ~cleaned;
4993 if (in->flushing_caps == 0) {
4994 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
4995 num_flushing_caps--;
4996 if (in->cap_snaps.empty())
4997 in->flushing_cap_item.remove_myself();
4998 }
4999 if (!in->caps_dirty())
5000 put_inode(in);
5001 }
5002 }
5003
5004 m->put();
5005}
5006
5007
5008void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
5009{
5010 mds_rank_t mds = session->mds_num;
5011 assert(in->caps[mds]);
5012 snapid_t follows = m->get_snap_follows();
5013
5014 if (in->cap_snaps.count(follows)) {
5015 CapSnap &capsnap = in->cap_snaps.at(follows);
5016 if (m->get_client_tid() != capsnap.flush_tid) {
5017 ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
5018 } else {
5019 ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
5020 << " on " << *in << dendl;
5021 InodeRef tmp_ref;
5022 if (in->get_num_ref() == 1)
5023 tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
5024 if (in->flushing_caps == 0 && in->cap_snaps.empty())
5025 in->flushing_cap_item.remove_myself();
5026 session->flushing_caps_tids.erase(capsnap.flush_tid);
5027 in->cap_snaps.erase(follows);
5028 }
5029 } else {
5030 ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
5031 << " on " << *in << dendl;
5032 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5033 }
5034
5035 m->put();
5036}
5037
5038class C_Client_DentryInvalidate : public Context {
5039private:
5040 Client *client;
5041 vinodeno_t dirino;
5042 vinodeno_t ino;
5043 string name;
5044public:
5045 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5046 client(c), name(dn->name) {
5047 if (client->use_faked_inos()) {
5048 dirino.ino = dn->dir->parent_inode->faked_ino;
5049 if (del)
5050 ino.ino = dn->inode->faked_ino;
5051 } else {
5052 dirino = dn->dir->parent_inode->vino();
5053 if (del)
5054 ino = dn->inode->vino();
5055 }
5056 if (!del)
5057 ino.ino = inodeno_t();
5058 }
5059 void finish(int r) override {
5060 // _async_dentry_invalidate is responsible for its own locking
5061 assert(!client->client_lock.is_locked_by_me());
5062 client->_async_dentry_invalidate(dirino, ino, name);
5063 }
5064};
5065
5066void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5067{
5068 if (unmounting)
5069 return;
5070 ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
5071 << " in dir " << dirino << dendl;
5072 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5073}
5074
5075void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5076{
5077 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5078 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5079}
5080
5081void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5082{
5083 int ref = in->get_num_ref();
5084
5085 if (in->dir && !in->dir->dentries.empty()) {
5086 for (auto p = in->dir->dentries.begin();
5087 p != in->dir->dentries.end(); ) {
5088 Dentry *dn = p->second;
5089 ++p;
5090 /* rmsnap removes whole subtree, need trim inodes recursively.
5091 * we don't need to invalidate dentries recursively. because
5092 * invalidating a directory dentry effectively invalidate
5093 * whole subtree */
5094 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5095 _try_to_trim_inode(dn->inode.get(), false);
5096
5097 if (dn->lru_is_expireable())
5098 unlink(dn, true, false); // keep dir, drop dentry
5099 }
5100 if (in->dir->dentries.empty()) {
5101 close_dir(in->dir);
5102 --ref;
5103 }
5104 }
5105
5106 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
5107 InodeRef snapdir = open_snapdir(in);
5108 _try_to_trim_inode(snapdir.get(), false);
5109 --ref;
5110 }
5111
5112 if (ref > 0 && in->ll_ref > 0 && sched_inval) {
5113 set<Dentry*>::iterator q = in->dn_set.begin();
5114 while (q != in->dn_set.end()) {
5115 Dentry *dn = *q++;
5116 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5117 // so in->dn_set doesn't always reflect the state of kernel's dcache.
5118 _schedule_invalidate_dentry_callback(dn, true);
5119 unlink(dn, true, true);
5120 }
5121 }
5122}
5123
5124void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
5125{
5126 mds_rank_t mds = session->mds_num;
5127 int used = get_caps_used(in);
5128 int wanted = in->caps_wanted();
5129
a8e16298
TL
5130 const unsigned new_caps = m->get_caps();
5131 const bool was_stale = session->cap_gen > cap->gen;
7c673cae
FG
5132 ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
5133 << " mds." << mds << " seq " << m->get_seq()
5134 << " caps now " << ccap_string(new_caps)
a8e16298
TL
5135 << " was " << ccap_string(cap->issued)
5136 << (was_stale ? "" : " (stale)") << dendl;
5137
5138 if (was_stale)
5139 cap->issued = cap->implemented = CEPH_CAP_PIN;
7c673cae 5140 cap->seq = m->get_seq();
28e407b8 5141 cap->gen = session->cap_gen;
7c673cae 5142
a8e16298
TL
5143 check_cap_issue(in, cap, new_caps);
5144
7c673cae 5145 // update inode
1adf2230
AA
5146 int issued;
5147 in->caps_issued(&issued);
5148 issued |= in->caps_dirty();
7c673cae 5149
1adf2230
AA
5150 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5151 !(issued & CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
5152 in->mode = m->head.mode;
5153 in->uid = m->head.uid;
5154 in->gid = m->head.gid;
5155 in->btime = m->btime;
5156 }
5157 bool deleted_inode = false;
1adf2230
AA
5158 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5159 !(issued & CEPH_CAP_LINK_EXCL)) {
7c673cae
FG
5160 in->nlink = m->head.nlink;
5161 if (in->nlink == 0 &&
5162 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5163 deleted_inode = true;
5164 }
1adf2230 5165 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
7c673cae
FG
5166 m->xattrbl.length() &&
5167 m->head.xattr_version > in->xattr_version) {
5168 bufferlist::iterator p = m->xattrbl.begin();
5169 ::decode(in->xattrs, p);
5170 in->xattr_version = m->head.xattr_version;
5171 }
28e407b8
AA
5172
5173 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5174 in->dirstat.nfiles = m->get_nfiles();
5175 in->dirstat.nsubdirs = m->get_nsubdirs();
5176 }
5177
1adf2230
AA
5178 if (new_caps & CEPH_CAP_ANY_RD) {
5179 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5180 m->get_ctime(), m->get_mtime(), m->get_atime());
5181 }
5182
5183 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5184 in->layout = m->get_layout();
5185 update_inode_file_size(in, issued, m->get_size(),
5186 m->get_truncate_seq(), m->get_truncate_size());
5187 }
5188
5189 if (m->inline_version > in->inline_version) {
5190 in->inline_data = m->inline_data;
5191 in->inline_version = m->inline_version;
5192 }
5193
5194 /* always take a newer change attr */
5195 if (m->get_change_attr() > in->change_attr)
5196 in->change_attr = m->get_change_attr();
7c673cae
FG
5197
5198 // max_size
5199 if (cap == in->auth_cap &&
1adf2230
AA
5200 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5201 (m->get_max_size() != in->max_size)) {
7c673cae
FG
5202 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5203 in->max_size = m->get_max_size();
5204 if (in->max_size > in->wanted_max_size) {
5205 in->wanted_max_size = 0;
5206 in->requested_max_size = 0;
5207 }
5208 }
5209
5210 bool check = false;
a8e16298
TL
5211 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5212 (wanted & ~(cap->wanted | new_caps))) {
5213 // If mds is importing cap, prior cap messages that update 'wanted'
5214 // may get dropped by mds (migrate seq mismatch).
5215 //
5216 // We don't send cap message to update 'wanted' if what we want are
5217 // already issued. If mds revokes caps, cap message that releases caps
5218 // also tells mds what we want. But if caps got revoked by mds forcedly
5219 // (session stale). We may haven't told mds what we want.
7c673cae 5220 check = true;
a8e16298 5221 }
7c673cae 5222
7c673cae
FG
5223
5224 // update caps
a8e16298 5225 auto revoked = cap->issued & ~new_caps;
b32b8144
FG
5226 if (revoked) {
5227 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5228 cap->issued = new_caps;
5229 cap->implemented |= new_caps;
5230
b32b8144
FG
5231 // recall delegations if we're losing caps necessary for them
5232 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5233 in->recall_deleg(false);
5234 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5235 in->recall_deleg(true);
5236
28e407b8
AA
5237 if ((used & revoked & CEPH_CAP_FILE_BUFFER) &&
5238 !_flush(in, new C_Client_FlushComplete(this, in))) {
7c673cae 5239 // waitin' for flush
28e407b8 5240 } else if (revoked & CEPH_CAP_FILE_CACHE) {
7c673cae
FG
5241 if (_release(in))
5242 check = true;
5243 } else {
5244 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5245 check = true;
5246 }
a8e16298
TL
5247 } else if (cap->issued == new_caps) {
5248 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
7c673cae 5249 } else {
a8e16298 5250 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
7c673cae
FG
5251 cap->issued = new_caps;
5252 cap->implemented |= new_caps;
5253
5254 if (cap == in->auth_cap) {
5255 // non-auth MDS is revoking the newly grant caps ?
5256 for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
5257 if (it->second == cap)
5258 continue;
5259 if (it->second->implemented & ~it->second->issued & new_caps) {
5260 check = true;
5261 break;
5262 }
5263 }
5264 }
5265 }
5266
5267 if (check)
5268 check_caps(in, 0);
5269
5270 // wake up waiters
5271 if (new_caps)
5272 signal_cond_list(in->waitfor_caps);
5273
5274 // may drop inode's last ref
5275 if (deleted_inode)
5276 _try_to_trim_inode(in, true);
5277
5278 m->put();
5279}
5280
7c673cae
FG
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  // Unix-style permission check: 0 if 'perms' may perform 'want'
  // (MAY_READ/MAY_WRITE/MAY_EXEC) on 'in', -EACCES otherwise.

  // root bypasses all mode checks
  if (perms.uid() == 0)
    return 0;

  // Non-owner with group bits set: consult POSIX ACLs first.
  // _posix_acl_permission() returns -EAGAIN when no applicable ACL entry
  // decides the outcome, in which case we fall through to mode bits.
  // NOTE(review): the S_IRWXG gate presumably mirrors the ACL mask
  // convention (group-class bits act as the ACL mask) — confirm.
  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -EACCES;
  return 0;
}
5297
5298int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5299 const UserPerm& perms)
5300{
5301 int r = _getattr_for_perm(in, perms);
5302 if (r < 0)
5303 goto out;
5304
5305 r = 0;
5306 if (strncmp(name, "system.", 7) == 0) {
5307 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5308 r = -EPERM;
5309 } else {
5310 r = inode_permission(in, perms, want);
5311 }
5312out:
1adf2230 5313 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5314 return r;
5315}
5316
5317ostream& operator<<(ostream &out, const UserPerm& perm) {
5318 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5319 return out;
5320}
5321
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  // Client-side permission check for setattr: may 'perms' apply the
  // attributes in 'stx' selected by 'mask' to 'in'?  Returns 0 on success,
  // -EPERM/-EACCES (or a getattr error) otherwise.  May clear S_ISGID in
  // stx->stx_mode as a side effect (POSIX chmod rule for non-members).
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncate needs write permission on the file itself
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; a non-root owner may only "change"
    // the uid to its current value (a no-op)
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // non-root must own the file and may only switch to a group it is a
    // member of, or keep the current group
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
			     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    // chmod requires root or ownership
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-root callers not in the file's (effective) group lose the
    // setgid bit, matching POSIX chmod semantics
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // explicit timestamps require ownership; setting them to "now"
      // (touch(1)-style) only requires write permission
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5378
5379int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5380{
181888fb 5381 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5382 unsigned want = 0;
5383
5384 if ((flags & O_ACCMODE) == O_WRONLY)
5385 want = MAY_WRITE;
5386 else if ((flags & O_ACCMODE) == O_RDWR)
5387 want = MAY_READ | MAY_WRITE;
5388 else if ((flags & O_ACCMODE) == O_RDONLY)
5389 want = MAY_READ;
5390 if (flags & O_TRUNC)
5391 want |= MAY_WRITE;
5392
5393 int r = 0;
5394 switch (in->mode & S_IFMT) {
5395 case S_IFLNK:
5396 r = -ELOOP;
5397 goto out;
5398 case S_IFDIR:
5399 if (want & MAY_WRITE) {
5400 r = -EISDIR;
5401 goto out;
5402 }
5403 break;
5404 }
5405
5406 r = _getattr_for_perm(in, perms);
5407 if (r < 0)
5408 goto out;
5409
5410 r = inode_permission(in, perms, want);
5411out:
5412 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5413 return r;
5414}
5415
5416int Client::may_lookup(Inode *dir, const UserPerm& perms)
5417{
181888fb 5418 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5419 int r = _getattr_for_perm(dir, perms);
5420 if (r < 0)
5421 goto out;
5422
5423 r = inode_permission(dir, perms, MAY_EXEC);
5424out:
5425 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5426 return r;
5427}
5428
5429int Client::may_create(Inode *dir, const UserPerm& perms)
5430{
181888fb 5431 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5432 int r = _getattr_for_perm(dir, perms);
5433 if (r < 0)
5434 goto out;
5435
5436 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5437out:
5438 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5439 return r;
5440}
5441
5442int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5443{
181888fb 5444 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5445 int r = _getattr_for_perm(dir, perms);
5446 if (r < 0)
5447 goto out;
5448
5449 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5450 if (r < 0)
5451 goto out;
5452
5453 /* 'name == NULL' means rmsnap */
5454 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5455 InodeRef otherin;
5456 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5457 if (r < 0)
5458 goto out;
5459 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5460 r = -EPERM;
5461 }
5462out:
5463 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5464 return r;
5465}
5466
5467int Client::may_hardlink(Inode *in, const UserPerm& perms)
5468{
181888fb 5469 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5470 int r = _getattr_for_perm(in, perms);
5471 if (r < 0)
5472 goto out;
5473
5474 if (perms.uid() == 0 || perms.uid() == in->uid) {
5475 r = 0;
5476 goto out;
5477 }
5478
5479 r = -EPERM;
5480 if (!S_ISREG(in->mode))
5481 goto out;
5482
5483 if (in->mode & S_ISUID)
5484 goto out;
5485
5486 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5487 goto out;
5488
5489 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5490out:
5491 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5492 return r;
5493}
5494
5495int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5496{
5497 int mask = CEPH_STAT_CAP_MODE;
5498 bool force = false;
5499 if (acl_type != NO_ACL) {
5500 mask |= CEPH_STAT_CAP_XATTR;
5501 force = in->xattr_version == 0;
5502 }
5503 return _getattr(in, mask, perms, force);
5504}
5505
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  // Versioned inode number: (ino, snapid) uniquely identifies an inode
  // across snapshots.
  return vinodeno_t(in->ino, in->snapid);
}
5511
inodeno_t Client::_get_inodeno(Inode *in)
{
  /* The caller must hold the client lock */
  // Plain inode number (snapshot-agnostic).
  return in->ino;
}
5517
5518
5519/**
5520 * Resolve an MDS spec to a list of MDS daemon GIDs.
5521 *
5522 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5523 * It may be '*' in which case it matches all GIDs.
5524 *
5525 * If no error is returned, the `targets` vector will be populated with at least
5526 * one MDS.
5527 */
5528int Client::resolve_mds(
5529 const std::string &mds_spec,
5530 std::vector<mds_gid_t> *targets)
5531{
5532 assert(fsmap);
5533 assert(targets != nullptr);
5534
5535 mds_role_t role;
5536 std::stringstream ss;
5537 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5538 if (role_r == 0) {
5539 // We got a role, resolve it to a GID
5540 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5541 << role << "'" << dendl;
5542 targets->push_back(
5543 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5544 return 0;
5545 }
5546
5547 std::string strtol_err;
5548 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5549 if (strtol_err.empty()) {
5550 // It is a possible GID
5551 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5552 if (fsmap->gid_exists(mds_gid)) {
5553 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5554 targets->push_back(mds_gid);
5555 } else {
5556 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5557 << dendl;
5558 return -ENOENT;
5559 }
5560 } else if (mds_spec == "*") {
5561 // It is a wildcard: use all MDSs
5562 const auto mds_info = fsmap->get_mds_info();
5563
5564 if (mds_info.empty()) {
5565 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5566 return -ENOENT;
5567 }
5568
5569 for (const auto i : mds_info) {
5570 targets->push_back(i.first);
5571 }
5572 } else {
5573 // It did not parse as an integer, it is not a wildcard, it must be a name
5574 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5575 if (mds_gid == 0) {
5576 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5577
5578 lderr(cct) << "FSMap: " << *fsmap << dendl;
5579
5580 return -ENOENT;
5581 } else {
5582 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5583 << "' to GID " << mds_gid << dendl;
5584 targets->push_back(mds_gid);
5585 }
5586 }
5587
5588 return 0;
5589}
5590
5591
5592/**
5593 * Authenticate with mon and establish global ID
5594 */
int Client::authenticate()
{
  assert(client_lock.is_locked_by_me());

  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks waiting for the monitor; drop
  // client_lock around it so message dispatch (which takes the same lock)
  // can make progress.
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  // the monitor assigned our global id during auth; adopt it as this
  // client's entity name on the messenger
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5615
int Client::fetch_fsmap(bool user)
{
  // Ensure we hold an FSMap (user=false) or FSMapUser (user=true) at least
  // as new as the monitor's latest epoch.  Returns 0 on success or a
  // negative errno from the version query.
  int r;
  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop client_lock while blocking on the mon reply; dispatch needs it
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe once and wait until handle_fsmap_user() wakes us with a
    // sufficiently new map
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5659
5660/**
5661 *
5662 * @mds_spec one of ID, rank, GID, "*"
5663 *
5664 */
/**
 * Send an administrative command to one or more MDS daemons.
 *
 * @param mds_spec one of ID, rank, GID, "*"
 * @param cmd      command vector (ceph tell style)
 * @param inbl     input payload
 * @param outbl    optional output payload sink
 * @param outs     optional output status-string sink
 * @param onfinish completion fired once all targeted daemons replied
 * @return 0 if the commands were sent, negative errno otherwise
 */
int Client::mds_command(
  const std::string &mds_spec,
  const vector<string>& cmd,
  const bufferlist& inbl,
  bufferlist *outbl,
  string *outs,
  Context *onfinish)
{
  Mutex::Locker lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // we need an up-to-date FSMap to resolve the spec and find addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets; a gather context completes onfinish only
  // after every per-daemon reply has arrived.
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    entity_inst_t inst = info.get_inst();
    ConnectionRef conn = messenger->get_connection(inst);

    // Generate MDSCommandOp state; handle_command_reply() matches replies
    // to this table entry by tid
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
      << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    MCommand *m = op.get_message(monclient->get_fsid());
    conn->send_message(m);
  }
  gather.activate();

  return 0;
}
5747
void Client::handle_command_reply(MCommandReply *m)
{
  // Match an MDS command reply to its pending entry in command_table,
  // deliver payload/status to the caller's sinks, and fire its completion.
  // Consumes (puts) the message in all paths.
  ceph_tid_t const tid = m->get_tid();

  ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;

  if (!command_table.exists(tid)) {
    // late or duplicate reply for a command we no longer track
    ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
    m->put();
    return;
  }

  auto &op = command_table.get_command(tid);
  if (op.outbl) {
    op.outbl->claim(m->get_data());  // claim avoids copying the payload
  }
  if (op.outs) {
    *op.outs = m->rs;
  }

  if (op.on_finish) {
    op.on_finish->complete(m->r);
  }

  command_table.erase(tid);

  m->put();
}
5776
5777// -------------------
5778// MOUNT
5779
/**
 * Mount the filesystem: authenticate, subscribe to the MDS map (optionally
 * for a named namespace), resolve the mount root, and pin it.
 *
 * @param mount_root  path to use as the root ("" means "/")
 * @param perms       credentials used for the root getattr walk
 * @param require_mds fail with CEPH_FUSE_NO_MDS_UP if the MDS cluster is
 *                    definitely unavailable instead of waiting
 * @return 0 on success, negative errno / CEPH_FUSE_NO_MDS_UP on failure
 */
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  // If a specific MDS namespace was configured, resolve it to a filesystem
  // cluster id and subscribe to that filesystem's map ("mdsmap.<cid>").
  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // getattr the mount point, then walk up towards "/" so ancestor inodes
  // are cached too (needed e.g. for quota checks on parents)
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	// mount point itself resolved; ancestors are merely inaccessible
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  _ll_get(root);  // pin the root inode for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
   */
  return 0;
}
5892
5893// UNMOUNT
5894
5895void Client::_close_sessions()
5896{
5897 while (!mds_sessions.empty()) {
5898 // send session closes!
5899 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5900 p != mds_sessions.end();
5901 ++p) {
5902 if (p->second->state != MetaSession::STATE_CLOSING) {
5903 _close_mds_session(p->second);
5904 }
5905 }
5906
5907 // wait for sessions to close
5908 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5909 mount_cond.Wait(client_lock);
5910 }
5911}
5912
31f18b77
FG
5913void Client::flush_mdlog_sync()
5914{
5915 if (mds_requests.empty())
5916 return;
5917 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5918 p != mds_sessions.end();
5919 ++p) {
5920 MetaSession *s = p->second;
5921 flush_mdlog(s);
5922 }
5923}
5924
5925void Client::flush_mdlog(MetaSession *session)
5926{
5927 // Only send this to Luminous or newer MDS daemons, older daemons
5928 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5929 const uint64_t features = session->con->get_features();
5930 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5931 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5932 session->con->send_message(m);
5933 }
5934}
5935
5936
b32b8144 5937void Client::_unmount()
7c673cae 5938{
181888fb
FG
5939 if (unmounting)
5940 return;
7c673cae
FG
5941
5942 ldout(cct, 2) << "unmounting" << dendl;
5943 unmounting = true;
5944
b32b8144
FG
5945 deleg_timeout = 0;
5946
31f18b77 5947 flush_mdlog_sync(); // flush the mdlog for pending requests, if any
7c673cae
FG
5948 while (!mds_requests.empty()) {
5949 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
5950 mount_cond.Wait(client_lock);
5951 }
5952
5953 if (tick_event)
5954 timer.cancel_event(tick_event);
5955 tick_event = 0;
5956
5957 cwd.reset();
5958
5959 // clean up any unclosed files
5960 while (!fd_map.empty()) {
5961 Fh *fh = fd_map.begin()->second;
5962 fd_map.erase(fd_map.begin());
5963 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
5964 _release_fh(fh);
5965 }
5966
5967 while (!ll_unclosed_fh_set.empty()) {
5968 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
5969 Fh *fh = *it;
5970 ll_unclosed_fh_set.erase(fh);
5971 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
5972 _release_fh(fh);
5973 }
5974
5975 while (!opened_dirs.empty()) {
5976 dir_result_t *dirp = *opened_dirs.begin();
5977 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
5978 _closedir(dirp);
5979 }
5980
5981 _ll_drop_pins();
5982
31f18b77
FG
5983 if (blacklisted) {
5984 ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;
5985
5986 if (cct->_conf->client_oc) {
5987 // Purge all cached data so that ObjectCacher doesn't get hung up
5988 // trying to flush it. ObjectCacher's behaviour on EBLACKLISTED
5989 // is to just leave things marked dirty
5990 // (http://tracker.ceph.com/issues/9105)
5991 for (const auto &i : inode_map) {
5992 objectcacher->purge_set(&(i.second->oset));
5993 }
5994 }
5995
5996 mounted = false;
5997 return;
5998 }
5999
7c673cae
FG
6000 while (unsafe_sync_write > 0) {
6001 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
6002 mount_cond.Wait(client_lock);
6003 }
6004
6005 if (cct->_conf->client_oc) {
6006 // flush/release all buffered data
6007 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
6008 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
6009 p != inode_map.end();
6010 p = next) {
6011 next = p;
6012 ++next;
6013 Inode *in = p->second;
6014 if (!in) {
6015 ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
6016 assert(in);
6017 }
6018 if (!in->caps.empty()) {
6019 InodeRef tmp_ref(in);
6020 _release(in);
6021 _flush(in, new C_Client_FlushComplete(this, in));
6022 }
6023 }
6024 }
6025
6026 flush_caps_sync();
6027 wait_sync_caps(last_flush_tid);
6028
6029 // empty lru cache
7c673cae
FG
6030 trim_cache();
6031
6032 while (lru.lru_get_size() > 0 ||
6033 !inode_map.empty()) {
6034 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6035 << "+" << inode_map.size() << " items"
6036 << ", waiting (for caps to release?)"
6037 << dendl;
6038 utime_t until = ceph_clock_now() + utime_t(5, 0);
6039 int r = mount_cond.WaitUntil(client_lock, until);
6040 if (r == ETIMEDOUT) {
6041 dump_cache(NULL);
6042 }
6043 }
6044 assert(lru.lru_get_size() == 0);
6045 assert(inode_map.empty());
6046
6047 // stop tracing
6048 if (!cct->_conf->client_trace.empty()) {
6049 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6050 traceout.close();
6051 }
6052
6053 _close_sessions();
6054
6055 mounted = false;
6056
6057 ldout(cct, 2) << "unmounted." << dendl;
6058}
6059
b32b8144
FG
void Client::unmount()
{
  // Public entry point: take the client lock and run the real teardown.
  Mutex::Locker lock(client_lock);
  _unmount();
}
6065
7c673cae
FG
6066void Client::flush_cap_releases()
6067{
6068 // send any cap releases
6069 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6070 p != mds_sessions.end();
6071 ++p) {
6072 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
6073 p->first)) {
6074 if (cct->_conf->client_inject_release_failure) {
6075 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6076 p->second->release->put();
6077 } else {
6078 p->second->con->send_message(p->second->release);
6079 }
6080 p->second->release = 0;
6081 }
6082 }
6083}
6084
void Client::tick()
{
  // Periodic housekeeping, re-armed via the safe timer (which already
  // holds client_lock when it calls us): abort over-age mount-time
  // requests, renew caps, push cap releases, process delayed caps, and
  // trim the cache.
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    // test hook: stall one tick, then self-reset the config knob
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	assert(client_lock.is_locked_by_me());
	tick();
      }));

  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    // before the mount completes, time out the oldest pending request so
    // a hung mount eventually fails instead of blocking forever
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: flush inodes whose hold timer has expired; the list is
  // ordered, so stop at the first not-yet-due entry
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;  // advance before pop_front removes the current entry
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6141
6142void Client::renew_caps()
6143{
6144 ldout(cct, 10) << "renew_caps()" << dendl;
6145 last_cap_renew = ceph_clock_now();
6146
6147 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6148 p != mds_sessions.end();
6149 ++p) {
6150 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6151 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6152 renew_caps(p->second);
6153 }
6154}
6155
void Client::renew_caps(MetaSession *session)
{
  // Send a RENEWCAPS to one MDS; the bumped cap_renew_seq lets the reply
  // handler match this particular renewal round.
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
6163
6164
6165// ===============================================================
6166// high level (POSIXy) interface
6167
int Client::_do_lookup(Inode *dir, const string& name, int mask,
		       InodeRef *target, const UserPerm& perms)
{
  // Issue a LOOKUP (or LOOKUPSNAP inside the snapdir) to the MDS for
  // 'name' in 'dir', requesting caps in 'mask'; fills *target on success.
  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
    mask |= DEBUG_GETATTR_CAPS;  // debug aid: demand extra caps in the reply
  req->head.args.getattr.mask = mask;

  ldout(cct, 10) << "_do_lookup on " << path << dendl;

  int r = make_request(req, perms, target);
  ldout(cct, 10) << "_do_lookup res is " << r << dendl;
  return r;
}
6188
/**
 * Resolve a single dentry @dname inside @dir, preferring cached state.
 *
 * Tries, in order: special names ("." / ".." / the snapdir), a cached
 * dentry covered by a valid dentry lease or directory lease, a locally
 * provable ENOENT (I_COMPLETE dir), and finally a synchronous MDS lookup
 * via _do_lookup().
 *
 * @param dir    parent directory inode
 * @param dname  name to resolve
 * @param mask   caps the caller wants issued on the resolved inode
 * @param target out: resolved inode
 * @param perms  credentials used if an MDS round trip is needed
 * @return 0 on success, negative errno (-ENOTDIR, -ENAMETOOLONG,
 *         -ENOENT, ...) on failure
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname == "..") {
    if (dir->dn_set.empty())
      *target = dir;       // root (or orphan): ".." resolves to itself
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // the magic snapshot directory (e.g. ".snap") only exists on live inodes
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    // only trust the cached dentry if its inode carries the caps the
    // caller asked for (or it is a negative dentry)
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?  (FILE_SHARED on the dir covers its dentries while
      // the shared_gen matches)
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	// negative dentry + complete dir contents => authoritative ENOENT
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss / stale: ask the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;   // cached negative dentry
  }
  touch_dn(dn);    // keep the dentry warm in the LRU

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6290
/**
 * Return (in *pdn) the dentry for @name in @dir, creating a fresh
 * null dentry if none is cached.
 *
 * @param dir         parent directory inode
 * @param name        dentry name
 * @param pdn         out: the cached or newly linked dentry
 * @param expect_null if true, fail with -EEXIST when a lease-backed
 *                    positive dentry already exists
 * @return 0 on success, -EEXIST as described above
 */
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?  (only a valid lease lets us trust the cached
    // positive dentry enough to reject an exclusive create locally)
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession *s = mds_sessions[dn->lease_mds];
      if (s->cap_ttl > now &&
	  s->cap_gen == dn->lease_gen) {
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new (null) one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6322
/**
 * Walk @origpath component by component, resolving each dentry via
 * _lookup(), and return the final inode in *end.
 *
 * Symlinks in a non-final position are always followed; a trailing
 * symlink is followed only when @followsym is true.  At most
 * MAXSYMLINKS expansions are performed before giving up with -ELOOP.
 *
 * @param origpath  path to resolve (absolute paths start at root,
 *                  relative ones at cwd)
 * @param end       out: final inode (may be NULL if caller only wants
 *                  the walk's status)
 * @param perms     credentials for lookups and permission checks
 * @param followsym whether to follow a symlink in the last component
 * @param mask      extra caps to request on the final component
 * @return 0 on success, negative errno on failure
 */
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // need search ('x') permission on every directory we traverse
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  cur = root;   // absolute target restarts from the root
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6404
6405
6406// namespace ops
6407
6408int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6409{
6410 Mutex::Locker lock(client_lock);
6411 tout(cct) << "link" << std::endl;
6412 tout(cct) << relexisting << std::endl;
6413 tout(cct) << relpath << std::endl;
6414
181888fb
FG
6415 if (unmounting)
6416 return -ENOTCONN;
6417
7c673cae
FG
6418 filepath existing(relexisting);
6419
6420 InodeRef in, dir;
6421 int r = path_walk(existing, &in, perm, true);
6422 if (r < 0)
6423 return r;
6424 if (std::string(relpath) == "/") {
6425 r = -EEXIST;
6426 return r;
6427 }
6428 filepath path(relpath);
6429 string name = path.last_dentry();
6430 path.pop_dentry();
6431
6432 r = path_walk(path, &dir, perm, true);
6433 if (r < 0)
6434 return r;
6435 if (cct->_conf->client_permissions) {
6436 if (S_ISDIR(in->mode)) {
6437 r = -EPERM;
6438 return r;
6439 }
6440 r = may_hardlink(in.get(), perm);
6441 if (r < 0)
6442 return r;
6443 r = may_create(dir.get(), perm);
6444 if (r < 0)
6445 return r;
6446 }
6447 r = _link(in.get(), dir.get(), name.c_str(), perm);
6448 return r;
6449}
6450
6451int Client::unlink(const char *relpath, const UserPerm& perm)
6452{
6453 Mutex::Locker lock(client_lock);
6454 tout(cct) << "unlink" << std::endl;
6455 tout(cct) << relpath << std::endl;
6456
181888fb
FG
6457 if (unmounting)
6458 return -ENOTCONN;
6459
7c673cae
FG
6460 if (std::string(relpath) == "/")
6461 return -EISDIR;
6462
6463 filepath path(relpath);
6464 string name = path.last_dentry();
6465 path.pop_dentry();
6466 InodeRef dir;
6467 int r = path_walk(path, &dir, perm);
6468 if (r < 0)
6469 return r;
6470 if (cct->_conf->client_permissions) {
6471 r = may_delete(dir.get(), name.c_str(), perm);
6472 if (r < 0)
6473 return r;
6474 }
6475 return _unlink(dir.get(), name.c_str(), perm);
6476}
6477
6478int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6479{
6480 Mutex::Locker lock(client_lock);
6481 tout(cct) << "rename" << std::endl;
6482 tout(cct) << relfrom << std::endl;
6483 tout(cct) << relto << std::endl;
6484
181888fb
FG
6485 if (unmounting)
6486 return -ENOTCONN;
6487
7c673cae
FG
6488 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6489 return -EBUSY;
6490
6491 filepath from(relfrom);
6492 filepath to(relto);
6493 string fromname = from.last_dentry();
6494 from.pop_dentry();
6495 string toname = to.last_dentry();
6496 to.pop_dentry();
6497
6498 InodeRef fromdir, todir;
6499 int r = path_walk(from, &fromdir, perm);
6500 if (r < 0)
6501 goto out;
6502 r = path_walk(to, &todir, perm);
6503 if (r < 0)
6504 goto out;
6505
6506 if (cct->_conf->client_permissions) {
6507 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6508 if (r < 0)
6509 return r;
6510 r = may_delete(todir.get(), toname.c_str(), perm);
6511 if (r < 0 && r != -ENOENT)
6512 return r;
6513 }
6514 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6515out:
6516 return r;
6517}
6518
6519// dirs
6520
6521int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6522{
6523 Mutex::Locker lock(client_lock);
6524 tout(cct) << "mkdir" << std::endl;
6525 tout(cct) << relpath << std::endl;
6526 tout(cct) << mode << std::endl;
6527 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6528
181888fb
FG
6529 if (unmounting)
6530 return -ENOTCONN;
6531
7c673cae
FG
6532 if (std::string(relpath) == "/")
6533 return -EEXIST;
6534
6535 filepath path(relpath);
6536 string name = path.last_dentry();
6537 path.pop_dentry();
6538 InodeRef dir;
6539 int r = path_walk(path, &dir, perm);
6540 if (r < 0)
6541 return r;
6542 if (cct->_conf->client_permissions) {
6543 r = may_create(dir.get(), perm);
6544 if (r < 0)
6545 return r;
6546 }
6547 return _mkdir(dir.get(), name.c_str(), mode, perm);
6548}
6549
/**
 * mkdir -p: create every missing directory along @relpath.
 *
 * Phase 1 walks the already-existing prefix; phase 2 creates the
 * remaining components one level at a time.  The loop index i is
 * shared between the two phases on purpose.
 *
 * @return 0 on success, -EEXIST if the whole path already exists,
 *         or the first error encountered.
 */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;
  // only a missing component (ENOENT) means "start creating here";
  // any other error is fatal
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // a racing creator may beat us to an intermediate component;
    // treat that as success and just look it up
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6606
6607int Client::rmdir(const char *relpath, const UserPerm& perms)
6608{
6609 Mutex::Locker lock(client_lock);
6610 tout(cct) << "rmdir" << std::endl;
6611 tout(cct) << relpath << std::endl;
6612
181888fb
FG
6613 if (unmounting)
6614 return -ENOTCONN;
6615
7c673cae
FG
6616 if (std::string(relpath) == "/")
6617 return -EBUSY;
6618
6619 filepath path(relpath);
6620 string name = path.last_dentry();
6621 path.pop_dentry();
6622 InodeRef dir;
6623 int r = path_walk(path, &dir, perms);
6624 if (r < 0)
6625 return r;
6626 if (cct->_conf->client_permissions) {
6627 int r = may_delete(dir.get(), name.c_str(), perms);
6628 if (r < 0)
6629 return r;
6630 }
6631 return _rmdir(dir.get(), name.c_str(), perms);
6632}
6633
6634int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6635{
6636 Mutex::Locker lock(client_lock);
6637 tout(cct) << "mknod" << std::endl;
6638 tout(cct) << relpath << std::endl;
6639 tout(cct) << mode << std::endl;
6640 tout(cct) << rdev << std::endl;
6641
181888fb
FG
6642 if (unmounting)
6643 return -ENOTCONN;
6644
7c673cae
FG
6645 if (std::string(relpath) == "/")
6646 return -EEXIST;
6647
6648 filepath path(relpath);
6649 string name = path.last_dentry();
6650 path.pop_dentry();
6651 InodeRef dir;
6652 int r = path_walk(path, &dir, perms);
6653 if (r < 0)
6654 return r;
6655 if (cct->_conf->client_permissions) {
6656 int r = may_create(dir.get(), perms);
6657 if (r < 0)
6658 return r;
6659 }
6660 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6661}
6662
6663// symlinks
6664
6665int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6666{
6667 Mutex::Locker lock(client_lock);
6668 tout(cct) << "symlink" << std::endl;
6669 tout(cct) << target << std::endl;
6670 tout(cct) << relpath << std::endl;
6671
181888fb
FG
6672 if (unmounting)
6673 return -ENOTCONN;
6674
7c673cae
FG
6675 if (std::string(relpath) == "/")
6676 return -EEXIST;
6677
6678 filepath path(relpath);
6679 string name = path.last_dentry();
6680 path.pop_dentry();
6681 InodeRef dir;
6682 int r = path_walk(path, &dir, perms);
6683 if (r < 0)
6684 return r;
6685 if (cct->_conf->client_permissions) {
6686 int r = may_create(dir.get(), perms);
6687 if (r < 0)
6688 return r;
6689 }
6690 return _symlink(dir.get(), name.c_str(), target, perms);
6691}
6692
6693int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6694{
6695 Mutex::Locker lock(client_lock);
6696 tout(cct) << "readlink" << std::endl;
6697 tout(cct) << relpath << std::endl;
6698
181888fb
FG
6699 if (unmounting)
6700 return -ENOTCONN;
6701
7c673cae
FG
6702 filepath path(relpath);
6703 InodeRef in;
6704 int r = path_walk(path, &in, perms, false);
6705 if (r < 0)
6706 return r;
6707
6708 return _readlink(in.get(), buf, size);
6709}
6710
6711int Client::_readlink(Inode *in, char *buf, size_t size)
6712{
6713 if (!in->is_symlink())
6714 return -EINVAL;
6715
6716 // copy into buf (at most size bytes)
6717 int r = in->symlink.length();
6718 if (r > (int)size)
6719 r = size;
6720 memcpy(buf, in->symlink.c_str(), r);
6721 return r;
6722}
6723
6724
6725// inode stuff
6726
6727int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6728{
94b18763 6729 bool yes = in->caps_issued_mask(mask, true);
7c673cae
FG
6730
6731 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6732 if (yes && !force)
6733 return 0;
6734
6735 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6736 filepath path;
6737 in->make_nosnap_relative_path(path);
6738 req->set_filepath(path);
6739 req->set_inode(in);
6740 req->head.args.getattr.mask = mask;
6741
6742 int res = make_request(req, perms);
6743 ldout(cct, 10) << "_getattr result=" << res << dendl;
6744 return res;
6745}
6746
/**
 * Apply the setattr changes in @stx selected by @mask to inode @in.
 *
 * Changes covered by exclusive caps we hold (Ax/Fx) are applied locally
 * and marked dirty; whatever remains in @mask afterwards is sent to the
 * MDS as a synchronous SETATTR.  Note that @mask is progressively
 * cleared as changes are satisfied locally.
 *
 * @param in    inode to modify (must not be a snapshot)
 * @param stx   new attribute values
 * @param mask  CEPH_SETATTR_* bits selecting which fields to apply
 * @param perms caller credentials
 * @param inp   out (optional): inode returned by the MDS reply
 * @return 0 on success, negative errno (-EROFS, -EDQUOT, -EFBIG, ...)
 */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must not blow the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;   // no exclusive cap: let the MDS do it
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // ownership/size changes must clear setuid/setgid bits
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only the permission bits change; file type bits are preserved
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
	in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
	in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // explicit timestamp change: bump time_warp_seq so readers notice
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask) {
    // everything was satisfied locally; no MDS round trip needed
    in->change_attr++;
    return 0;
  }

force_request:
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  // drop caps so other clients see the new attributes promptly
  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  // re-fetch the changed attrs when the reply comes back
  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6937
6938/* Note that we only care about attrs that setattr cares about */
6939void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6940{
6941 stx->stx_size = st->st_size;
6942 stx->stx_mode = st->st_mode;
6943 stx->stx_uid = st->st_uid;
6944 stx->stx_gid = st->st_gid;
6945 stx->stx_mtime = st->st_mtim;
6946 stx->stx_atime = st->st_atim;
6947}
6948
6949int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6950 const UserPerm& perms, InodeRef *inp)
6951{
6952 int ret = _do_setattr(in, stx, mask, perms, inp);
6953 if (ret < 0)
6954 return ret;
6955 if (mask & CEPH_SETATTR_MODE)
6956 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6957 return ret;
6958}
6959
6960int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6961 const UserPerm& perms)
6962{
6963 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6964 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6965 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6966 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6967 if (cct->_conf->client_permissions) {
6968 int r = may_setattr(in.get(), stx, mask, perms);
6969 if (r < 0)
6970 return r;
6971 }
6972 return __setattrx(in.get(), stx, mask, perms);
6973}
6974
6975int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6976 const UserPerm& perms)
6977{
6978 struct ceph_statx stx;
6979
6980 stat_to_statx(attr, &stx);
6981 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
6982
6983 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6984 mask &= ~CEPH_SETATTR_UID;
6985 }
6986 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6987 mask &= ~CEPH_SETATTR_GID;
6988 }
6989
7c673cae
FG
6990 return _setattrx(in, &stx, mask, perms);
6991}
6992
6993int Client::setattr(const char *relpath, struct stat *attr, int mask,
6994 const UserPerm& perms)
6995{
6996 Mutex::Locker lock(client_lock);
6997 tout(cct) << "setattr" << std::endl;
6998 tout(cct) << relpath << std::endl;
6999 tout(cct) << mask << std::endl;
7000
181888fb
FG
7001 if (unmounting)
7002 return -ENOTCONN;
7003
7c673cae
FG
7004 filepath path(relpath);
7005 InodeRef in;
7006 int r = path_walk(path, &in, perms);
7007 if (r < 0)
7008 return r;
7009 return _setattr(in, attr, mask, perms);
7010}
7011
7012int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7013 const UserPerm& perms, int flags)
7014{
7015 Mutex::Locker lock(client_lock);
7016 tout(cct) << "setattrx" << std::endl;
7017 tout(cct) << relpath << std::endl;
7018 tout(cct) << mask << std::endl;
7019
181888fb
FG
7020 if (unmounting)
7021 return -ENOTCONN;
7022
7c673cae
FG
7023 filepath path(relpath);
7024 InodeRef in;
7025 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7026 if (r < 0)
7027 return r;
7028 return _setattrx(in, stx, mask, perms);
7029}
7030
7031int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7032{
7033 Mutex::Locker lock(client_lock);
7034 tout(cct) << "fsetattr" << std::endl;
7035 tout(cct) << fd << std::endl;
7036 tout(cct) << mask << std::endl;
7037
181888fb
FG
7038 if (unmounting)
7039 return -ENOTCONN;
7040
7c673cae
FG
7041 Fh *f = get_filehandle(fd);
7042 if (!f)
7043 return -EBADF;
7044#if defined(__linux__) && defined(O_PATH)
7045 if (f->flags & O_PATH)
7046 return -EBADF;
7047#endif
7048 return _setattr(f->inode, attr, mask, perms);
7049}
7050
7051int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7052{
7053 Mutex::Locker lock(client_lock);
7054 tout(cct) << "fsetattr" << std::endl;
7055 tout(cct) << fd << std::endl;
7056 tout(cct) << mask << std::endl;
7057
181888fb
FG
7058 if (unmounting)
7059 return -ENOTCONN;
7060
7c673cae
FG
7061 Fh *f = get_filehandle(fd);
7062 if (!f)
7063 return -EBADF;
7064#if defined(__linux__) && defined(O_PATH)
7065 if (f->flags & O_PATH)
7066 return -EBADF;
7067#endif
7068 return _setattrx(f->inode, stx, mask, perms);
7069}
7070
7071int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7072 frag_info_t *dirstat, int mask)
7073{
7074 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7075 Mutex::Locker lock(client_lock);
7076 tout(cct) << "stat" << std::endl;
7077 tout(cct) << relpath << std::endl;
181888fb
FG
7078
7079 if (unmounting)
7080 return -ENOTCONN;
7081
7c673cae
FG
7082 filepath path(relpath);
7083 InodeRef in;
7084 int r = path_walk(path, &in, perms, true, mask);
7085 if (r < 0)
7086 return r;
7087 r = _getattr(in, mask, perms);
7088 if (r < 0) {
7089 ldout(cct, 3) << "stat exit on error!" << dendl;
7090 return r;
7091 }
7092 fill_stat(in, stbuf, dirstat);
7093 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7094 return r;
7095}
7096
7097unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7098{
7099 unsigned mask = 0;
7100
7101 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7102 if (flags & AT_NO_ATTR_SYNC)
7103 goto out;
7104
7105 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7106 mask |= CEPH_CAP_PIN;
7107 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7108 mask |= CEPH_CAP_AUTH_SHARED;
7109 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7110 mask |= CEPH_CAP_LINK_SHARED;
7111 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7112 mask |= CEPH_CAP_FILE_SHARED;
7113 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7114 mask |= CEPH_CAP_XATTR_SHARED;
7115out:
7116 return mask;
7117}
7118
7119int Client::statx(const char *relpath, struct ceph_statx *stx,
7120 const UserPerm& perms,
7121 unsigned int want, unsigned int flags)
7122{
7123 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
7124 Mutex::Locker lock(client_lock);
7125 tout(cct) << "statx" << std::endl;
7126 tout(cct) << relpath << std::endl;
181888fb
FG
7127
7128 if (unmounting)
7129 return -ENOTCONN;
7130
7c673cae
FG
7131 filepath path(relpath);
7132 InodeRef in;
7133
7134 unsigned mask = statx_to_mask(flags, want);
7135
7136 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7137 if (r < 0)
7138 return r;
7139
7140 r = _getattr(in, mask, perms);
7141 if (r < 0) {
7142 ldout(cct, 3) << "statx exit on error!" << dendl;
7143 return r;
7144 }
7145
7146 fill_statx(in, mask, stx);
7147 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7148 return r;
7149}
7150
7151int Client::lstat(const char *relpath, struct stat *stbuf,
7152 const UserPerm& perms, frag_info_t *dirstat, int mask)
7153{
7154 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7155 Mutex::Locker lock(client_lock);
7156 tout(cct) << "lstat" << std::endl;
7157 tout(cct) << relpath << std::endl;
181888fb
FG
7158
7159 if (unmounting)
7160 return -ENOTCONN;
7161
7c673cae
FG
7162 filepath path(relpath);
7163 InodeRef in;
7164 // don't follow symlinks
7165 int r = path_walk(path, &in, perms, false, mask);
7166 if (r < 0)
7167 return r;
7168 r = _getattr(in, mask, perms);
7169 if (r < 0) {
7170 ldout(cct, 3) << "lstat exit on error!" << dendl;
7171 return r;
7172 }
7173 fill_stat(in, stbuf, dirstat);
7174 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7175 return r;
7176}
7177
/**
 * Populate a struct stat from cached inode state.
 *
 * @param in      source inode
 * @param st      destination stat buffer (zeroed first)
 * @param dirstat out (optional): the directory's frag stats
 * @param rstat   out (optional): the directory's recursive stats
 * @return the caps currently issued on @in (so the caller can judge
 *         how fresh the returned data is)
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // the snapshot id doubles as the device number so snapshots of the
  // same file appear distinct to userland
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // CephFS does not allow hard links to dirs, so nlink is synthesized
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report whichever of ctime/mtime is newer as the ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // directory "size" is either recursive bytes or entry count,
    // depending on configuration
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;   // 512-byte blocks, rounded up
  }
  st->st_blksize = MAX(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7239
/*
 * Populate a ceph_statx from the cached Inode.
 *
 * @param in    inode whose cached attributes are copied out
 * @param mask  CEPH_CAP_*_SHARED bits describing which attribute groups the
 *              caller holds caps for; 0 means "no sync was done, report all"
 * @param stx   output structure; stx_mask records which fields were filled
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    // Overwrites the type-only mode set above with the full mode bits.
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      switch (in->nlink) {
      case 0:
	stx->stx_nlink = 0; /* dir is unlinked */
	break;
      case 1:
	stx->stx_nlink = 1 /* parent dentry */
	  + 1 /* <dir>/. */
	  + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	break;
      default:
	// A directory can never have more than one hard link.
	ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // Directory "size": recursive bytes or number of entries, by config.
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // round up to 512-byte blocks
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // Report the later of ctime/mtime as the change time.
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7322
// Mark a dentry as recently used so the LRU cache keeps it around longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7327
7328int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7329{
7330 Mutex::Locker lock(client_lock);
7331 tout(cct) << "chmod" << std::endl;
7332 tout(cct) << relpath << std::endl;
7333 tout(cct) << mode << std::endl;
181888fb
FG
7334
7335 if (unmounting)
7336 return -ENOTCONN;
7337
7c673cae
FG
7338 filepath path(relpath);
7339 InodeRef in;
7340 int r = path_walk(path, &in, perms);
7341 if (r < 0)
7342 return r;
7343 struct stat attr;
7344 attr.st_mode = mode;
7345 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7346}
7347
7348int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7349{
7350 Mutex::Locker lock(client_lock);
7351 tout(cct) << "fchmod" << std::endl;
7352 tout(cct) << fd << std::endl;
7353 tout(cct) << mode << std::endl;
181888fb
FG
7354
7355 if (unmounting)
7356 return -ENOTCONN;
7357
7c673cae
FG
7358 Fh *f = get_filehandle(fd);
7359 if (!f)
7360 return -EBADF;
7361#if defined(__linux__) && defined(O_PATH)
7362 if (f->flags & O_PATH)
7363 return -EBADF;
7364#endif
7365 struct stat attr;
7366 attr.st_mode = mode;
7367 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7368}
7369
7370int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7371{
7372 Mutex::Locker lock(client_lock);
7373 tout(cct) << "lchmod" << std::endl;
7374 tout(cct) << relpath << std::endl;
7375 tout(cct) << mode << std::endl;
181888fb
FG
7376
7377 if (unmounting)
7378 return -ENOTCONN;
7379
7c673cae
FG
7380 filepath path(relpath);
7381 InodeRef in;
7382 // don't follow symlinks
7383 int r = path_walk(path, &in, perms, false);
7384 if (r < 0)
7385 return r;
7386 struct stat attr;
7387 attr.st_mode = mode;
7388 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7389}
7390
7391int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7392 const UserPerm& perms)
7393{
7394 Mutex::Locker lock(client_lock);
7395 tout(cct) << "chown" << std::endl;
7396 tout(cct) << relpath << std::endl;
7397 tout(cct) << new_uid << std::endl;
7398 tout(cct) << new_gid << std::endl;
181888fb
FG
7399
7400 if (unmounting)
7401 return -ENOTCONN;
7402
7c673cae
FG
7403 filepath path(relpath);
7404 InodeRef in;
7405 int r = path_walk(path, &in, perms);
7406 if (r < 0)
7407 return r;
7408 struct stat attr;
7409 attr.st_uid = new_uid;
7410 attr.st_gid = new_gid;
181888fb 7411 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7412}
7413
7414int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7415{
7416 Mutex::Locker lock(client_lock);
7417 tout(cct) << "fchown" << std::endl;
7418 tout(cct) << fd << std::endl;
7419 tout(cct) << new_uid << std::endl;
7420 tout(cct) << new_gid << std::endl;
181888fb
FG
7421
7422 if (unmounting)
7423 return -ENOTCONN;
7424
7c673cae
FG
7425 Fh *f = get_filehandle(fd);
7426 if (!f)
7427 return -EBADF;
7428#if defined(__linux__) && defined(O_PATH)
7429 if (f->flags & O_PATH)
7430 return -EBADF;
7431#endif
7432 struct stat attr;
7433 attr.st_uid = new_uid;
7434 attr.st_gid = new_gid;
7435 int mask = 0;
7436 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7437 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7438 return _setattr(f->inode, &attr, mask, perms);
7439}
7440
7441int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7442 const UserPerm& perms)
7443{
7444 Mutex::Locker lock(client_lock);
7445 tout(cct) << "lchown" << std::endl;
7446 tout(cct) << relpath << std::endl;
7447 tout(cct) << new_uid << std::endl;
7448 tout(cct) << new_gid << std::endl;
181888fb
FG
7449
7450 if (unmounting)
7451 return -ENOTCONN;
7452
7c673cae
FG
7453 filepath path(relpath);
7454 InodeRef in;
7455 // don't follow symlinks
7456 int r = path_walk(path, &in, perms, false);
7457 if (r < 0)
7458 return r;
7459 struct stat attr;
7460 attr.st_uid = new_uid;
7461 attr.st_gid = new_gid;
7462 int mask = 0;
7463 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7464 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7465 return _setattr(in, &attr, mask, perms);
7466}
7467
7468int Client::utime(const char *relpath, struct utimbuf *buf,
7469 const UserPerm& perms)
7470{
7471 Mutex::Locker lock(client_lock);
7472 tout(cct) << "utime" << std::endl;
7473 tout(cct) << relpath << std::endl;
7474 tout(cct) << buf->modtime << std::endl;
7475 tout(cct) << buf->actime << std::endl;
181888fb
FG
7476
7477 if (unmounting)
7478 return -ENOTCONN;
7479
7c673cae
FG
7480 filepath path(relpath);
7481 InodeRef in;
7482 int r = path_walk(path, &in, perms);
7483 if (r < 0)
7484 return r;
7485 struct stat attr;
7486 stat_set_mtime_sec(&attr, buf->modtime);
7487 stat_set_mtime_nsec(&attr, 0);
7488 stat_set_atime_sec(&attr, buf->actime);
7489 stat_set_atime_nsec(&attr, 0);
7490 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7491}
7492
7493int Client::lutime(const char *relpath, struct utimbuf *buf,
7494 const UserPerm& perms)
7495{
7496 Mutex::Locker lock(client_lock);
7497 tout(cct) << "lutime" << std::endl;
7498 tout(cct) << relpath << std::endl;
7499 tout(cct) << buf->modtime << std::endl;
7500 tout(cct) << buf->actime << std::endl;
181888fb
FG
7501
7502 if (unmounting)
7503 return -ENOTCONN;
7504
7c673cae
FG
7505 filepath path(relpath);
7506 InodeRef in;
7507 // don't follow symlinks
7508 int r = path_walk(path, &in, perms, false);
7509 if (r < 0)
7510 return r;
7511 struct stat attr;
7512 stat_set_mtime_sec(&attr, buf->modtime);
7513 stat_set_mtime_nsec(&attr, 0);
7514 stat_set_atime_sec(&attr, buf->actime);
7515 stat_set_atime_nsec(&attr, 0);
7516 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7517}
7518
7519int Client::flock(int fd, int operation, uint64_t owner)
7520{
7521 Mutex::Locker lock(client_lock);
7522 tout(cct) << "flock" << std::endl;
7523 tout(cct) << fd << std::endl;
7524 tout(cct) << operation << std::endl;
7525 tout(cct) << owner << std::endl;
181888fb
FG
7526
7527 if (unmounting)
7528 return -ENOTCONN;
7529
7c673cae
FG
7530 Fh *f = get_filehandle(fd);
7531 if (!f)
7532 return -EBADF;
7533
7534 return _flock(f, operation, owner);
7535}
7536
7537int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7538{
7539 Mutex::Locker lock(client_lock);
7540 tout(cct) << "opendir" << std::endl;
7541 tout(cct) << relpath << std::endl;
181888fb
FG
7542
7543 if (unmounting)
7544 return -ENOTCONN;
7545
7c673cae
FG
7546 filepath path(relpath);
7547 InodeRef in;
7548 int r = path_walk(path, &in, perms, true);
7549 if (r < 0)
7550 return r;
7551 if (cct->_conf->client_permissions) {
7552 int r = may_open(in.get(), O_RDONLY, perms);
7553 if (r < 0)
7554 return r;
7555 }
7556 r = _opendir(in.get(), dirpp, perms);
7557 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7558 if (r != -ENOTDIR)
7559 tout(cct) << (unsigned long)*dirpp << std::endl;
7560 return r;
7561}
7562
7563int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7564{
7565 if (!in->is_dir())
7566 return -ENOTDIR;
7567 *dirpp = new dir_result_t(in, perms);
7568 opened_dirs.insert(*dirpp);
1adf2230 7569 ldout(cct, 8) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
7570 return 0;
7571}
7572
7573
7574int Client::closedir(dir_result_t *dir)
7575{
7576 Mutex::Locker lock(client_lock);
7577 tout(cct) << "closedir" << std::endl;
7578 tout(cct) << (unsigned long)dir << std::endl;
7579
7580 ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
7581 _closedir(dir);
7582 return 0;
7583}
7584
7585void Client::_closedir(dir_result_t *dirp)
7586{
7587 ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
7588 if (dirp->inode) {
7589 ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
7590 dirp->inode.reset();
7591 }
7592 _readdir_drop_dirp_buffer(dirp);
7593 opened_dirs.erase(dirp);
7594 delete dirp;
7595}
7596
7597void Client::rewinddir(dir_result_t *dirp)
7598{
7599 Mutex::Locker lock(client_lock);
7c673cae 7600 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
181888fb
FG
7601
7602 if (unmounting)
7603 return;
7604
7c673cae
FG
7605 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7606 _readdir_drop_dirp_buffer(d);
7607 d->reset();
7608}
7609
// Return the current position of a directory handle (cf. telldir(3)).
// NOTE(review): unlike seekdir/rewinddir this does not take client_lock;
// presumably a racy read of the offset is acceptable here — confirm.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7616
/*
 * Reposition a directory handle to an offset previously returned by
 * telldir() (cf. seekdir(3)).  Drops the readdir buffer whenever the
 * cached frag no longer matches the requested position.
 */
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // Hash-ordered listing: only a backward seek invalidates the buffer.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Frag-ordered listing: reset when rewinding to the start, when the
    // target frag differs from the buffered one, or when seeking backward
    // within the buffered frag.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7650
7651
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
// Fill a struct dirent from a name/type/ino triple; next_off is the cookie
// of the entry that follows (stored in d_off where the platform has it).
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // Truncate to 255 chars and always NUL-terminate.
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  // Convert the S_IF* mode type to the DT_* dirent encoding.
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7674
/*
 * Advance a directory handle to the next fragment of the directory,
 * or mark it at-end if the current frag is the rightmost one.
 */
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // Frag-ordered: start the new frag from scratch and re-resolve it
    // against the (possibly updated) dirfragtree.
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7700
/*
 * Re-map the handle's current fragment through the inode's dirfragtree.
 * If the tree changed (split/merge), point the handle at the frag that
 * actually covers the old position and restart that frag.
 */
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  // Hash-ordered listings do not track frags this way; nothing to do.
  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7717
7718void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7719{
7720 ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
7721 dirp->buffer.clear();
7722}
7723
/*
 * Fetch one fragment worth of dentries from the MDS into dirp->buffer.
 * Issues a READDIR (or LSSNAP for the snap dir) MetaRequest; on -EAGAIN
 * the dirfragtree was stale, so re-resolve the frag and retry.
 * Returns 0 on success; on any other error the handle is marked at-end.
 */
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  assert(dirp);
  assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // Resume the listing after the last name we already returned.
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // Our dirfragtree was out of date; re-pick the frag and try again.
    ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7778
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// dentries by their readdir offset using the frag-aware fpos comparison.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7784
/*
 * Serve a readdir from the locally cached, complete-and-ordered dentry set.
 *
 * Invokes cb(p, ...) once per entry, dropping client_lock around each
 * callback.  Returns 0 at end of directory, a positive value if the
 * callback asked to stop, -EAGAIN if the cache became incomplete mid-walk
 * (caller falls back to fetching from the MDS), or another negative errno.
 */
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // Binary-search the cache for the first entry at/after the current offset.
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // The cache may be invalidated while client_lock is dropped below;
    // re-check completeness on every iteration.
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      // Hand the callback a referenced Inode*; caller is responsible for
      // releasing it.
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // Drop the lock for the (possibly slow / re-entrant) callback.
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7868
/*
 * Core readdir iteration: invoke cb(p, ...) for each directory entry.
 *
 * Synthesizes "." and ".." first, then serves entries from the local cache
 * when the directory is complete and we hold FILE_SHARED caps, otherwise
 * fetches fragments from the MDS.  client_lock is dropped around every
 * callback invocation.  Returns 0 at end of directory, >0 if the callback
 * asked to stop, <0 on error.
 *
 * @param d       directory handle from opendir()
 * @param cb      per-entry callback
 * @param p       opaque pointer forwarded to cb
 * @param want    CEPH_STATX_* fields the caller wants
 * @param flags   AT_* flags controlling attribute sync
 * @param getref  if true, each cb receives a referenced Inode*
 */
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // Offset 0: synthesize the "." entry.
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // Offset 1: synthesize the ".." entry (root is its own parent).
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // -EAGAIN: cache became incomplete mid-walk; fall through to the MDS.
    if (err != -EAGAIN)
      return err;
  }

  // Fetch fragments from the MDS and walk them.
  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      // Drop the lock for the (possibly slow / re-entrant) callback.
      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // Whole directory traversed without concurrent modification: mark the
    // inode's dentry set complete (and ordered, if order was preserved).
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // not reached
  return 0;
}
8061
8062
// Plain readdir_r: readdirplus_r with no statx/want/flags and no inode out.
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
8067
8068/*
8069 * readdirplus_r
8070 *
8071 * returns
8072 * 1 if we got a dirent
8073 * 0 for end of directory
8074 * <0 on error
8075 */
8076
8077struct single_readdir {
8078 struct dirent *de;
8079 struct ceph_statx *stx;
8080 Inode *inode;
8081 bool full;
8082};
8083
8084static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8085 struct ceph_statx *stx, off_t off,
8086 Inode *in)
8087{
8088 single_readdir *c = static_cast<single_readdir *>(p);
8089
8090 if (c->full)
8091 return -1; // already filled this dirent
8092
8093 *c->de = *de;
8094 if (c->stx)
8095 *c->stx = *stx;
8096 c->inode = in;
8097 c->full = true;
8098 return 1;
8099}
8100
// readdir(3)-style interface: return the next entry or NULL at end/error
// (errno is set on error).
// NOTE: uses a function-local static dirent, so like historical POSIX
// readdir(3) this is not reentrant/thread-safe; use readdirplus_r for a
// caller-owned buffer.
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    errno = -ret; // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;
}
8123
8124int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8125 struct ceph_statx *stx, unsigned want,
8126 unsigned flags, Inode **out)
8127{
8128 single_readdir sr;
8129 sr.de = de;
8130 sr.stx = stx;
8131 sr.inode = NULL;
8132 sr.full = false;
8133
8134 // our callback fills the dirent and sets sr.full=true on first
8135 // call, and returns -1 the second time around.
8136 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8137 if (r < -1)
8138 return r;
8139 if (out)
8140 *out = sr.inode;
8141 if (sr.full)
8142 return 1;
8143 return 0;
8144}
8145
8146
8147/* getdents */
8148struct getdents_result {
8149 char *buf;
8150 int buflen;
8151 int pos;
8152 bool fullent;
8153};
8154
8155static int _readdir_getdent_cb(void *p, struct dirent *de,
8156 struct ceph_statx *stx, off_t off, Inode *in)
8157{
8158 struct getdents_result *c = static_cast<getdents_result *>(p);
8159
8160 int dlen;
8161 if (c->fullent)
8162 dlen = sizeof(*de);
8163 else
8164 dlen = strlen(de->d_name) + 1;
8165
8166 if (c->pos + dlen > c->buflen)
8167 return -1; // doesn't fit
8168
8169 if (c->fullent) {
8170 memcpy(c->buf + c->pos, de, sizeof(*de));
8171 } else {
8172 memcpy(c->buf + c->pos, de->d_name, dlen);
8173 }
8174 c->pos += dlen;
8175 return 0;
8176}
8177
8178int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8179{
8180 getdents_result gr;
8181 gr.buf = buf;
8182 gr.buflen = buflen;
8183 gr.fullent = fullent;
8184 gr.pos = 0;
8185
8186 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8187
8188 if (r < 0) { // some error
8189 if (r == -1) { // buffer ran out of space
8190 if (gr.pos) { // but we got some entries already!
8191 return gr.pos;
8192 } // or we need a larger buffer
8193 return -ERANGE;
8194 } else { // actual error, return it
8195 return r;
8196 }
8197 }
8198 return gr.pos;
8199}
8200
8201
8202/* getdir */
8203struct getdir_result {
8204 list<string> *contents;
8205 int num;
8206};
8207
8208static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8209{
8210 getdir_result *r = static_cast<getdir_result *>(p);
8211
8212 r->contents->push_back(de->d_name);
8213 r->num++;
8214 return 0;
8215}
8216
8217int Client::getdir(const char *relpath, list<string>& contents,
8218 const UserPerm& perms)
8219{
8220 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8221 {
8222 Mutex::Locker lock(client_lock);
8223 tout(cct) << "getdir" << std::endl;
8224 tout(cct) << relpath << std::endl;
8225 }
8226
8227 dir_result_t *d;
8228 int r = opendir(relpath, &d, perms);
8229 if (r < 0)
8230 return r;
8231
8232 getdir_result gr;
8233 gr.contents = &contents;
8234 gr.num = 0;
8235 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8236
8237 closedir(d);
8238
8239 if (r < 0)
8240 return r;
8241 return gr.num;
8242}
8243
8244
8245/****** file i/o **********/
/*
 * Open (and optionally create) the file at relpath.
 *
 * @param relpath      path relative to the mount root
 * @param flags        POSIX open flags (O_CREAT, O_EXCL, O_NOFOLLOW, ...)
 * @param perms        caller credentials
 * @param mode         permission bits for a newly created file
 * @param stripe_unit/stripe_count/object_size/data_pool
 *                     file layout overrides for creation (0/NULL = defaults)
 * @return a new integer file descriptor, or a negative errno
 */
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // Exclusive create of an existing file fails outright.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  // Target missing and creation requested: create it in the parent dir.
  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create() may already have produced a file handle; otherwise open now.
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8330
// Convenience overload: open with the filesystem's default file layout.
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8336
/*
 * Ask an MDS to resolve an inode by number, hinting with the hash of a
 * known dentry name in directory dirino (CEPH_MDS_OP_LOOKUPHASH).
 * Side effect on success: the inode is inserted into the local cache.
 */
int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
			const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
  filepath path(ino);
  req->set_filepath(path);

  // The second filepath carries the dentry-name hash as a decimal string.
  uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
  char f[30];
  sprintf(f, "%u", h);
  filepath path2(dirino);
  path2.push_dentry(string(f));
  req->set_filepath2(path2);

  // Any active MDS rank can service the lookup; pick one at random.
  int r = make_request(req, perms, NULL, NULL,
		       rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
  return r;
}
8362
8363
/**
 * Load inode into local cache.
 *
 * If inode pointer is non-NULL, and take a reference on
 * the resulting Inode object in one operation, so that caller
 * can safely assume inode will still be there after return.
 *
 * Caller must hold client_lock (see the lookup_ino() wrapper below).
 */
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << "lookup_ino enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // Any active MDS rank can service the lookup; pick one at random.
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // On success the reply processing has inserted the inode into inode_map;
    // hand back a referenced pointer.
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << "lookup_ino exit(" << ino << ") = " << r << dendl;
  return r;
}
8393
1adf2230
AA
8394int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8395{
8396 Mutex::Locker lock(client_lock);
8397 return _lookup_ino(ino, perms, inode);
8398}
7c673cae
FG
8399
8400/**
8401 * Find the parent inode of `ino` and insert it into
8402 * our cache. Conditionally also set `parent` to a referenced
8403 * Inode* if caller provides non-NULL value.
8404 */
1adf2230 8405int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
7c673cae 8406{
1adf2230 8407 ldout(cct, 8) << "lookup_parent enter(" << ino->ino << ")" << dendl;
7c673cae 8408
181888fb
FG
8409 if (unmounting)
8410 return -ENOTCONN;
8411
7c673cae
FG
8412 if (!ino->dn_set.empty()) {
8413 // if we exposed the parent here, we'd need to check permissions,
8414 // but right now we just rely on the MDS doing so in make_request
1adf2230 8415 ldout(cct, 8) << "lookup_parent dentry already present" << dendl;
7c673cae
FG
8416 return 0;
8417 }
8418
8419 if (ino->is_root()) {
8420 *parent = NULL;
1adf2230 8421 ldout(cct, 8) << "ino is root, no parent" << dendl;
7c673cae
FG
8422 return -EINVAL;
8423 }
8424
8425 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8426 filepath path(ino->ino);
8427 req->set_filepath(path);
8428
8429 InodeRef target;
8430 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8431 // Give caller a reference to the parent ino if they provided a pointer.
8432 if (parent != NULL) {
8433 if (r == 0) {
8434 *parent = target.get();
8435 _ll_get(*parent);
1adf2230 8436 ldout(cct, 8) << "lookup_parent found parent " << (*parent)->ino << dendl;
7c673cae
FG
8437 } else {
8438 *parent = NULL;
8439 }
8440 }
1adf2230 8441 ldout(cct, 8) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8442 return r;
8443}
8444
1adf2230
AA
8445int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8446{
8447 Mutex::Locker lock(client_lock);
8448 return _lookup_parent(ino, perms, parent);
8449}
7c673cae
FG
8450
8451/**
8452 * Populate the parent dentry for `ino`, provided it is
8453 * a child of `parent`.
8454 */
1adf2230 8455int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
7c673cae
FG
8456{
8457 assert(parent->is_dir());
7c673cae
FG
8458 ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;
8459
181888fb
FG
8460 if (unmounting)
8461 return -ENOTCONN;
8462
7c673cae
FG
8463 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8464 req->set_filepath2(filepath(parent->ino));
8465 req->set_filepath(filepath(ino->ino));
8466 req->set_inode(ino);
8467
8468 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8469 ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
8470 return r;
8471}
8472
1adf2230
AA
8473int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8474{
8475 Mutex::Locker lock(client_lock);
8476 return _lookup_name(ino, parent, perms);
8477}
7c673cae
FG
8478
8479 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8480{
8481 assert(in);
8482 Fh *f = new Fh(in);
8483 f->mode = cmode;
8484 f->flags = flags;
8485
8486 // inode
8487 f->actor_perms = perms;
8488
8489 ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;
8490
8491 if (in->snapid != CEPH_NOSNAP) {
8492 in->snap_cap_refs++;
8493 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8494 << ccap_string(in->caps_issued()) << dendl;
8495 }
8496
8497 const md_config_t *conf = cct->_conf;
8498 f->readahead.set_trigger_requests(1);
8499 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8500 uint64_t max_readahead = Readahead::NO_LIMIT;
8501 if (conf->client_readahead_max_bytes) {
8502 max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8503 }
8504 if (conf->client_readahead_max_periods) {
8505 max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8506 }
8507 f->readahead.set_max_readahead_size(max_readahead);
8508 vector<uint64_t> alignments;
8509 alignments.push_back(in->layout.get_period());
8510 alignments.push_back(in->layout.stripe_unit);
8511 f->readahead.set_alignments(alignments);
8512
8513 return f;
8514}
8515
8516int Client::_release_fh(Fh *f)
8517{
8518 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8519 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8520 Inode *in = f->inode.get();
1adf2230 8521 ldout(cct, 8) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;
7c673cae 8522
b32b8144
FG
8523 in->unset_deleg(f);
8524
7c673cae
FG
8525 if (in->snapid == CEPH_NOSNAP) {
8526 if (in->put_open_ref(f->mode)) {
8527 _flush(in, new C_Client_FlushComplete(this, in));
8528 check_caps(in, 0);
8529 }
8530 } else {
8531 assert(in->snap_cap_refs > 0);
8532 in->snap_cap_refs--;
8533 }
8534
8535 _release_filelocks(f);
8536
8537 // Finally, read any async err (i.e. from flushes)
8538 int err = f->take_async_err();
8539 if (err != 0) {
8540 ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
8541 << cpp_strerror(err) << dendl;
8542 } else {
8543 ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
8544 }
8545
8546 _put_fh(f);
8547
8548 return err;
8549}
8550
8551void Client::_put_fh(Fh *f)
8552{
8553 int left = f->put();
8554 if (!left) {
8555 delete f;
8556 }
8557}
8558
/*
 * Core open path (client_lock held).  Takes an open ref on the inode,
 * issues an MDS OPEN unless we already hold the caps this mode wants,
 * and on success hands back a fresh Fh via *fhp.
 *
 * Returns 0/positive on success, negative errno on failure; the open
 * ref taken here is dropped again on every failure path.
 */
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only: refuse any write-ish open flag
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold everything this open mode needs; just let the
    // MDS know our wanted set may have changed
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    // need the MDS: send an OPEN (O_CREAT was handled by the caller)
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode);
      } else {
	// only needed the caps momentarily to serialize behind deleg recall
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // failure: undo the open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8637
8638int Client::_renew_caps(Inode *in)
8639{
8640 int wanted = in->caps_file_wanted();
8641 if (in->is_any_caps() &&
8642 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
8643 check_caps(in, CHECK_CAPS_NODELAY);
8644 return 0;
8645 }
8646
8647 int flags = 0;
8648 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
8649 flags = O_RDWR;
8650 else if (wanted & CEPH_CAP_FILE_RD)
8651 flags = O_RDONLY;
8652 else if (wanted & CEPH_CAP_FILE_WR)
8653 flags = O_WRONLY;
8654
8655 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8656 filepath path;
8657 in->make_nosnap_relative_path(path);
8658 req->set_filepath(path);
8659 req->head.args.open.flags = flags;
8660 req->head.args.open.pool = -1;
8661 if (cct->_conf->client_debug_getattr_caps)
8662 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8663 else
8664 req->head.args.open.mask = 0;
8665 req->set_inode(in);
8666
8667 // duplicate in case Cap goes away; not sure if that race is a concern?
8668 const UserPerm *pperm = in->get_best_perms();
8669 UserPerm perms;
8670 if (pperm != NULL)
8671 perms = *pperm;
8672 int ret = make_request(req, perms);
8673 return ret;
8674}
8675
8676int Client::close(int fd)
8677{
8678 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8679 Mutex::Locker lock(client_lock);
8680 tout(cct) << "close" << std::endl;
8681 tout(cct) << fd << std::endl;
8682
181888fb
FG
8683 if (unmounting)
8684 return -ENOTCONN;
8685
7c673cae
FG
8686 Fh *fh = get_filehandle(fd);
8687 if (!fh)
8688 return -EBADF;
8689 int err = _release_fh(fh);
8690 fd_map.erase(fd);
8691 put_fd(fd);
8692 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8693 return err;
8694}
8695
8696
8697// ------------
8698// read, write
8699
8700loff_t Client::lseek(int fd, loff_t offset, int whence)
8701{
8702 Mutex::Locker lock(client_lock);
8703 tout(cct) << "lseek" << std::endl;
8704 tout(cct) << fd << std::endl;
8705 tout(cct) << offset << std::endl;
8706 tout(cct) << whence << std::endl;
8707
181888fb
FG
8708 if (unmounting)
8709 return -ENOTCONN;
8710
7c673cae
FG
8711 Fh *f = get_filehandle(fd);
8712 if (!f)
8713 return -EBADF;
8714#if defined(__linux__) && defined(O_PATH)
8715 if (f->flags & O_PATH)
8716 return -EBADF;
8717#endif
8718 return _lseek(f, offset, whence);
8719}
8720
8721loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8722{
8723 Inode *in = f->inode.get();
8724 int r;
8725
8726 switch (whence) {
8727 case SEEK_SET:
8728 f->pos = offset;
8729 break;
8730
8731 case SEEK_CUR:
8732 f->pos += offset;
8733 break;
8734
8735 case SEEK_END:
8736 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8737 if (r < 0)
8738 return r;
8739 f->pos = in->size + offset;
8740 break;
8741
8742 default:
8743 ceph_abort();
8744 }
8745
1adf2230 8746 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
7c673cae
FG
8747 return f->pos;
8748}
8749
8750
/*
 * Serialize access to f->pos (client_lock held on entry).
 *
 * Implements a FIFO ticket queue: if the position is already locked or
 * others are queued, enqueue our own Cond and wait until the lock is
 * free AND we are at the front of the queue, so waiters acquire in
 * arrival order.  Cond::Wait() drops and retakes client_lock.
 * NOTE(review): waiters rely on unlock_fh_pos() (or a departing waiter)
 * signalling their Cond — confirm the wakeup path.
 */
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until unlocked and we are the oldest waiter
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8768
8769void Client::unlock_fh_pos(Fh *f)
8770{
8771 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8772 f->pos_locked = false;
8773}
8774
8775int Client::uninline_data(Inode *in, Context *onfinish)
8776{
8777 if (!in->inline_data.length()) {
8778 onfinish->complete(0);
8779 return 0;
8780 }
8781
8782 char oid_buf[32];
8783 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
8784 object_t oid = oid_buf;
8785
8786 ObjectOperation create_ops;
8787 create_ops.create(false);
8788
8789 objecter->mutate(oid,
8790 OSDMap::file_to_object_locator(in->layout),
8791 create_ops,
8792 in->snaprealm->get_snap_context(),
8793 ceph::real_clock::now(),
8794 0,
8795 NULL);
8796
8797 bufferlist inline_version_bl;
8798 ::encode(in->inline_version, inline_version_bl);
8799
8800 ObjectOperation uninline_ops;
8801 uninline_ops.cmpxattr("inline_version",
8802 CEPH_OSD_CMPXATTR_OP_GT,
8803 CEPH_OSD_CMPXATTR_MODE_U64,
8804 inline_version_bl);
8805 bufferlist inline_data = in->inline_data;
8806 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
8807 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
8808
8809 objecter->mutate(oid,
8810 OSDMap::file_to_object_locator(in->layout),
8811 uninline_ops,
8812 in->snaprealm->get_snap_context(),
8813 ceph::real_clock::now(),
8814 0,
8815 onfinish);
8816
8817 return 0;
8818}
8819
8820//
8821
8822// blocking osd interface
8823
8824int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8825{
8826 Mutex::Locker lock(client_lock);
8827 tout(cct) << "read" << std::endl;
8828 tout(cct) << fd << std::endl;
8829 tout(cct) << size << std::endl;
8830 tout(cct) << offset << std::endl;
8831
181888fb
FG
8832 if (unmounting)
8833 return -ENOTCONN;
8834
7c673cae
FG
8835 Fh *f = get_filehandle(fd);
8836 if (!f)
8837 return -EBADF;
8838#if defined(__linux__) && defined(O_PATH)
8839 if (f->flags & O_PATH)
8840 return -EBADF;
8841#endif
8842 bufferlist bl;
8843 int r = _read(f, offset, size, &bl);
8844 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8845 if (r >= 0) {
8846 bl.copy(0, bl.length(), buf);
8847 r = bl.length();
8848 }
8849 return r;
8850}
8851
8852int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8853{
8854 if (iovcnt < 0)
8855 return -EINVAL;
8856 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8857}
8858
/*
 * Core read path (client_lock held).  A negative `offset` means "use and
 * advance the fd position" (serialized via lock_fh_pos).  Handles inline
 * data, cached (async) reads and sync OSD reads, retrying after a size
 * revalidation on a short read.  Returns bytes read or negative errno.
 */
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // offset < 0 => stateful read from f->pos; hold the pos lock so
  // concurrent stateful reads/writes don't interleave
  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // inline state unknown; fetch it before deciding how to read
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
        unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;   // O_DIRECT bypasses the object cacher

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // can't serve inline data without the cache cap: push the inline
      // data out to RADOS first, then fall through to a normal read
      onuninline = new C_SafeCond(&uninline_flock,
                                  &uninline_cond,
                                  &uninline_done,
                                  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve the read straight from the inline blob, zero-filling the
      // gap between the inline length and EOF
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // cached path: honor O_RSYNC by flushing dirty data first
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read: our cached size may be stale; drop caps, refetch
      // the size, and retry if there is more file past our position
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait (outside client_lock) for the uninline mutation to finish
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
9002
// Pin the Fh and count this readahead as pending for the lifetime of
// the completion (released in the destructor).
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
9008
// Balance the pending count and Fh reference taken in the constructor.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
9013
// Readahead I/O completed: drop the cap refs taken when it was issued.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9018
/*
 * Read through the object cacher (caller holds CEPH_CAP_FILE_CACHE and
 * client_lock).  Blocks (dropping client_lock) only when the data is
 * not already cached, then kicks off background readahead.  Returns
 * bytes read (trimmed to EOF) or negative errno.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                              off, len, bl, 0, onfinish);
  if (r == 0) {
    // miss: hold the CACHE cap ref and wait outside client_lock
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      // fire-and-forget: C_Readahead drops the cap refs on completion
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9083
/*
 * Synchronous read straight from the OSDs, bypassing the object cacher.
 * Loops until `len` bytes are gathered; a hole below the known EOF is
 * zero-filled, and a short read at/over the cached EOF sets *checkeof
 * so the caller can revalidate the size and retry.  Drops client_lock
 * while each OSD read is in flight.  Returns bytes read or negative errno.
 */
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    // wait for the OSD read without holding client_lock
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // hit (possibly stale) EOF: let the caller revalidate and retry
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9150
9151
9152/*
9153 * we keep count of uncommitted sync writes on the inode, so that
9154 * fsync can DDRT.
9155 */
9156void Client::_sync_write_commit(Inode *in)
9157{
9158 assert(unsafe_sync_write > 0);
9159 unsafe_sync_write--;
9160
9161 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9162
9163 ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
9164 if (unsafe_sync_write == 0 && unmounting) {
9165 ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
9166 mount_cond.Signal();
9167 }
9168}
9169
9170int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9171{
9172 Mutex::Locker lock(client_lock);
9173 tout(cct) << "write" << std::endl;
9174 tout(cct) << fd << std::endl;
9175 tout(cct) << size << std::endl;
9176 tout(cct) << offset << std::endl;
9177
181888fb
FG
9178 if (unmounting)
9179 return -ENOTCONN;
9180
7c673cae
FG
9181 Fh *fh = get_filehandle(fd);
9182 if (!fh)
9183 return -EBADF;
9184#if defined(__linux__) && defined(O_PATH)
9185 if (fh->flags & O_PATH)
9186 return -EBADF;
9187#endif
9188 int r = _write(fh, offset, size, buf, NULL, 0);
9189 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9190 return r;
9191}
9192
9193int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9194{
9195 if (iovcnt < 0)
9196 return -EINVAL;
9197 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9198}
9199
/*
 * Shared backend for preadv()/pwritev().  Writes hand the iovec straight
 * to _write(); reads go through a single bufferlist that is then
 * scattered back into the caller's iovec (partially filling the last
 * vector on a short read).  Returns bytes transferred or negative errno.
 */
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  // total transfer length across all vectors
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // scatter the contiguous read result back into the iovec
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
        bl.copy(bufoff, resid, (char *)iov[j].iov_base);
        break;
      } else {
        bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9249
/*
 * Core write path (client_lock held).  Exactly one of `buf` or `iov`
 * supplies the data.  offset < 0 means "use/advance the fd position"
 * (O_APPEND seeks to EOF first).  Handles quota checks, setuid/setgid
 * stripping, inline data, buffered (object cacher) and sync OSD writes,
 * then updates size/mtime metadata.  Returns bytes written or -errno.
 */
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	           const struct iovec *iov, int iovcnt)
{
  // non-zero => stateful write; new fd position to install on success
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  uint64_t endoff = offset + size;
  std::list<InodeRef> quota_roots;
  if (endoff > in->size &&
      is_quota_bytes_exceeded(in, endoff - in->size, f->actor_perms, &quota_roots)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    // inline state unknown; fetch before choosing a write strategy
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;   // O_DIRECT bypasses the object cacher

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // result would exceed inline limits (or we lack the buffer cap):
      // push inline data out to RADOS, then write normally
      onuninline = new C_SafeCond(&uninline_flock,
                                  &uninline_cond,
                                  &uninline_done,
                                  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // update the inline blob in place: keep the tail past endoff,
      // drop/extend up to offset, then append the new data
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    // wait for the OSD write without holding client_lock
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    // stateful write: advance the fd position now that we succeeded
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, quota_roots)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (onuninline) {
    // wait (outside client_lock) for the uninline mutation to finish
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9495
9496int Client::_flush(Fh *f)
9497{
9498 Inode *in = f->inode.get();
9499 int err = f->take_async_err();
9500 if (err != 0) {
9501 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9502 << cpp_strerror(err) << dendl;
9503 } else {
9504 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9505 }
9506
9507 return err;
9508}
9509
// Truncate the file at 'relpath' to 'length' bytes, implemented as a
// size-only setattrx (only stx_size is read because of the
// CEPH_SETATTR_SIZE mask, so the rest of stx may stay uninitialized).
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
9516
9517int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9518{
9519 Mutex::Locker lock(client_lock);
9520 tout(cct) << "ftruncate" << std::endl;
9521 tout(cct) << fd << std::endl;
9522 tout(cct) << length << std::endl;
9523
181888fb
FG
9524 if (unmounting)
9525 return -ENOTCONN;
9526
7c673cae
FG
9527 Fh *f = get_filehandle(fd);
9528 if (!f)
9529 return -EBADF;
9530#if defined(__linux__) && defined(O_PATH)
9531 if (f->flags & O_PATH)
9532 return -EBADF;
9533#endif
9534 struct stat attr;
9535 attr.st_size = length;
9536 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9537}
9538
// Flush a file descriptor's data (and, unless syncdataonly, metadata)
// to stable storage.  Also surfaces any pending async write error held
// on the Fh, clearing it so it is only reported once.
int Client::fsync(int fd, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be used for I/O.
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9574
// Core fsync implementation on an inode: flush buffered file data
// through the object cacher (if enabled), flush dirty caps and wait for
// unsafe MDS requests when metadata sync is requested, then wait for
// everything to become stable.  Called with client_lock held; drops and
// re-takes it around the data-flush wait.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    // Kick off writeback of all dirty buffered data for this inode; the
    // completion lets us wait for (and collect the result of) the flush.
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Metadata too: push dirty caps to the MDS synchronously and remember
    // the flush tid so we can wait for its ack below.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // Force the MDS to commit its journal, then wait until the most
    // recent unsafe request on this inode becomes safe (requests complete
    // in order, so waiting on the last one covers all of them).
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // No object cacher: wait for in-flight sync writes to commit by
    // watching the FILE_BUFFER cap refcount drain.
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9641
9642int Client::_fsync(Fh *f, bool syncdataonly)
9643{
1adf2230 9644 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
9645 return _fsync(f->inode.get(), syncdataonly);
9646}
9647
// stat(2) by file descriptor: refresh attributes from the MDS per
// 'mask', then fill the caller's struct stat.
int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  int r = _getattr(f->inode, mask, perms);
  if (r < 0)
    return r;
  fill_stat(f->inode, stbuf, NULL);
  ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
  return r;
}
9667
// statx(2)-style stat by file descriptor.  Only round-trips to the MDS
// when the wanted fields are not already covered by issued caps (the
// AT_NO_ATTR_SYNC-style shortcut is encoded in statx_to_mask()).
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  // Skip the getattr when our caps already guarantee fresh values.
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9697
9698// not written yet, but i want to link!
9699
// Change the client's current working directory to 'relpath' and return
// the resulting absolute path in 'new_cwd'.
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  // swap avoids a refcount churn when replacing the cwd reference
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9722
b5b8bbf5 9723void Client::_getcwd(string& dir, const UserPerm& perms)
7c673cae
FG
9724{
9725 filepath path;
9726 ldout(cct, 10) << "getcwd " << *cwd << dendl;
9727
9728 Inode *in = cwd.get();
9729 while (in != root) {
9730 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
9731
9732 // A cwd or ancester is unlinked
9733 if (in->dn_set.empty()) {
9734 return;
9735 }
9736
9737 Dentry *dn = in->get_first_parent();
9738
9739
9740 if (!dn) {
9741 // look it up
9742 ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
9743 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9744 filepath path(in->ino);
9745 req->set_filepath(path);
9746 req->set_inode(in);
9747 int res = make_request(req, perms);
9748 if (res < 0)
9749 break;
9750
9751 // start over
9752 path = filepath();
9753 in = cwd.get();
9754 continue;
9755 }
9756 path.push_front_dentry(dn->name);
9757 in = dn->dir->parent_inode;
9758 }
9759 dir = "/";
9760 dir += path.get_path();
9761}
9762
// Public getcwd: take the client lock and delegate; silently leaves
// 'dir' unchanged while unmounting (void return, no error channel).
void Client::getcwd(string& dir, const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  if (!unmounting)
    _getcwd(dir, perms);
}
9769
7c673cae
FG
9770int Client::statfs(const char *path, struct statvfs *stbuf,
9771 const UserPerm& perms)
9772{
9773 Mutex::Locker l(client_lock);
9774 tout(cct) << "statfs" << std::endl;
91327a77 9775 unsigned long int total_files_on_fs;
7c673cae 9776
181888fb
FG
9777 if (unmounting)
9778 return -ENOTCONN;
9779
7c673cae
FG
9780 ceph_statfs stats;
9781 C_SaferCond cond;
d2e6a577
FG
9782
9783 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9784 if (data_pools.size() == 1) {
9785 objecter->get_fs_stats(stats, data_pools[0], &cond);
9786 } else {
9787 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9788 }
7c673cae
FG
9789
9790 client_lock.Unlock();
9791 int rval = cond.wait();
91327a77
AA
9792 assert(root);
9793 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
7c673cae
FG
9794 client_lock.Lock();
9795
9796 if (rval < 0) {
9797 ldout(cct, 1) << "underlying call to statfs returned error: "
9798 << cpp_strerror(rval)
9799 << dendl;
9800 return rval;
9801 }
9802
9803 memset(stbuf, 0, sizeof(*stbuf));
9804
9805 /*
9806 * we're going to set a block size of 4MB so we can represent larger
9807 * FSes without overflowing. Additionally convert the space
9808 * measurements from KB to bytes while making them in terms of
9809 * blocks. We use 4MB only because it is big enough, and because it
9810 * actually *is* the (ceph) default block size.
9811 */
9812 const int CEPH_BLOCK_SHIFT = 22;
9813 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9814 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
91327a77
AA
9815 stbuf->f_files = total_files_on_fs;
9816 stbuf->f_ffree = 0;
7c673cae
FG
9817 stbuf->f_favail = -1;
9818 stbuf->f_fsid = -1; // ??
9819 stbuf->f_flag = 0; // ??
9820 stbuf->f_namemax = NAME_MAX;
9821
9822 // Usually quota_root will == root_ancestor, but if the mount root has no
9823 // quota but we can see a parent of it that does have a quota, we'll
9824 // respect that one instead.
9825 assert(root != nullptr);
9826 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
9827
9828 // get_quota_root should always give us something
9829 // because client quotas are always enabled
9830 assert(quota_root != nullptr);
9831
9832 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
9833
9834 // Skip the getattr if any sessions are stale, as we don't want to
9835 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9836 // is unhealthy.
9837 if (!_any_stale_sessions()) {
9838 int r = _getattr(quota_root, 0, perms, true);
9839 if (r != 0) {
9840 // Ignore return value: error getting latest inode metadata is not a good
9841 // reason to break "df".
9842 lderr(cct) << "Error in getattr on quota root 0x"
9843 << std::hex << quota_root->ino << std::dec
9844 << " statfs result may be outdated" << dendl;
9845 }
9846 }
9847
9848 // Special case: if there is a size quota set on the Inode acting
9849 // as the root for this client mount, then report the quota status
9850 // as the filesystem statistics.
9851 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
9852 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
31f18b77
FG
9853 // It is possible for a quota to be exceeded: arithmetic here must
9854 // handle case where used > total.
9855 const fsblkcnt_t free = total > used ? total - used : 0;
7c673cae
FG
9856
9857 stbuf->f_blocks = total;
9858 stbuf->f_bfree = free;
9859 stbuf->f_bavail = free;
9860 } else {
d2e6a577 9861 // General case: report the cluster statistics returned from RADOS. Because
7c673cae
FG
9862 // multiple pools may be used without one filesystem namespace via
9863 // layouts, this is the most correct thing we can do.
9864 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
9865 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
9866 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
9867 }
9868
9869 return rval;
9870}
9871
// Perform a file-lock operation (fcntl or flock flavor) against the MDS.
//
// Translates the POSIX struct flock into a filelock_change MDS request,
// optionally arms the interrupt callback so a blocked SETFILELOCK can be
// cancelled, and on success mirrors the result into the local lock-state
// tracking on the inode (and, unless 'removing', on the file handle).
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Map the POSIX lock type onto the MDS lock command.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // Only a blocking SETFILELOCK that actually acquires may wait.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the conflicting (or free) lock the MDS returned back into
      // the caller's struct flock.
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the now-granted lock in the inode's local lock state,
      // lazily allocating the tracking structure.
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
	// Track the lock on the file handle too so _release_filelocks()
	// can drop it when the handle is closed.
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9982
// Cancel a blocked file-lock request: mark it aborted so it will not be
// re-sent, and if it already reached an MDS, send a companion
// *_INTR unlock so the MDS releases/abandons it.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // Build the interrupt request as a copy of the original lock change,
  // retargeted at the INTR rule with an UNLOCK type.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
10015
10016void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10017{
10018 if (!in->fcntl_locks && !in->flock_locks)
10019 return;
10020
10021 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10022 ::encode(nr_fcntl_locks, bl);
10023 if (nr_fcntl_locks) {
10024 ceph_lock_state_t* lock_state = in->fcntl_locks;
10025 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10026 p != lock_state->held_locks.end();
10027 ++p)
10028 ::encode(p->second, bl);
10029 }
10030
10031 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10032 ::encode(nr_flock_locks, bl);
10033 if (nr_flock_locks) {
10034 ceph_lock_state_t* lock_state = in->flock_locks;
10035 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10036 p != lock_state->held_locks.end();
10037 ++p)
10038 ::encode(p->second, bl);
10039 }
10040
10041 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
10042 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10043}
10044
10045void Client::_release_filelocks(Fh *fh)
10046{
10047 if (!fh->fcntl_locks && !fh->flock_locks)
10048 return;
10049
10050 Inode *in = fh->inode.get();
10051 ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
10052
10053 list<pair<int, ceph_filelock> > to_release;
10054
10055 if (fh->fcntl_locks) {
10056 ceph_lock_state_t* lock_state = fh->fcntl_locks;
10057 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10058 p != lock_state->held_locks.end();
10059 ++p)
10060 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
10061 delete fh->fcntl_locks;
10062 }
10063 if (fh->flock_locks) {
10064 ceph_lock_state_t* lock_state = fh->flock_locks;
10065 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10066 p != lock_state->held_locks.end();
10067 ++p)
10068 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
10069 delete fh->flock_locks;
10070 }
10071
10072 if (to_release.empty())
10073 return;
10074
10075 struct flock fl;
10076 memset(&fl, 0, sizeof(fl));
10077 fl.l_whence = SEEK_SET;
10078 fl.l_type = F_UNLCK;
10079
10080 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10081 p != to_release.end();
10082 ++p) {
10083 fl.l_start = p->second.start;
10084 fl.l_len = p->second.length;
10085 fl.l_pid = p->second.pid;
10086 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10087 p->second.owner, true);
10088 }
10089}
10090
10091void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10092 ceph_lock_state_t *lock_state)
10093{
10094 int lock_cmd;
10095 if (F_RDLCK == fl->l_type)
10096 lock_cmd = CEPH_LOCK_SHARED;
10097 else if (F_WRLCK == fl->l_type)
10098 lock_cmd = CEPH_LOCK_EXCL;
10099 else
10100 lock_cmd = CEPH_LOCK_UNLOCK;;
10101
10102 ceph_filelock filelock;
10103 filelock.start = fl->l_start;
10104 filelock.length = fl->l_len;
10105 filelock.client = 0;
10106 // see comment in _do_filelock()
10107 filelock.owner = owner | (1ULL << 63);
10108 filelock.pid = fl->l_pid;
10109 filelock.type = lock_cmd;
10110
10111 if (filelock.type == CEPH_LOCK_UNLOCK) {
10112 list<ceph_filelock> activated_locks;
10113 lock_state->remove_lock(filelock, activated_locks);
10114 } else {
10115 bool r = lock_state->add_lock(filelock, false, false, NULL);
10116 assert(r);
10117 }
10118}
10119
// F_GETLK: query the MDS for a conflicting lock; on success *fl is
// overwritten with the conflicting lock (or F_UNLCK if none).
int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
  return ret;
}
10127
// F_SETLK/F_SETLKW: set (or clear) a POSIX record lock; 'sleep' != 0
// makes the request block until it can be granted.
int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
  return ret;
}
10136
10137int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10138{
10139 Inode *in = fh->inode.get();
10140 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10141
10142 int sleep = !(cmd & LOCK_NB);
10143 cmd &= ~LOCK_NB;
10144
10145 int type;
10146 switch (cmd) {
10147 case LOCK_SH:
10148 type = F_RDLCK;
10149 break;
10150 case LOCK_EX:
10151 type = F_WRLCK;
10152 break;
10153 case LOCK_UN:
10154 type = F_UNLCK;
10155 break;
10156 default:
10157 return -EINVAL;
10158 }
10159
10160 struct flock fl;
10161 memset(&fl, 0, sizeof(fl));
10162 fl.l_type = type;
10163 fl.l_whence = SEEK_SET;
10164
10165 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10166 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10167 return ret;
10168}
10169
// Low-level statfs entry point; the inode argument is unused because
// statistics are global to the mount.
int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
{
  /* Since the only thing this does is wrap a call to statfs, and
     statfs takes a lock, it doesn't seem we have a need to split it
     out. */
  return statfs(0, stbuf, perms);
}
10177
// Register the libcephfs/FUSE callback set and start the finisher
// threads that will deliver each registered callback asynchronously.
void Client::ll_register_callbacks(struct client_callback_args *args)
{
  if (!args)
    return;
  Mutex::Locker l(client_lock);
  ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  // umask_cb needs no finisher thread; it is invoked synchronously.
  umask_cb = args->umask_cb;
}
10208
// Verify we have a working way to invalidate kernel dentries (either a
// dentry-invalidate callback or a remount callback).  Depending on
// configuration, failure either aborts the client or merely warns.
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount(false);
  }
  if (r) {
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort) {
      lderr(cct) << "no method to invalidate kernel dentry cache; quitting!" << dendl;
      ceph_abort();
    } else {
      lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    }
  }
  return r;
}
10234
// Flush the whole client to stable storage: all buffered file data,
// all dirty caps, and all unsafe MDS requests.  Called with client_lock
// held; drops and re-takes it while waiting for the data flush.
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  // Wait for the object cacher flush started above, if it has not
  // already completed.
  if (!flush_done) {
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
10269
// Public syncfs entry point: lock, reject if unmounting, delegate.
int Client::sync_fs()
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return _sync_fs();
}
10279
// Drop all clean data from the object cacher; returns the number of
// bytes released (dirty data is not discarded).
int64_t Client::drop_caches()
{
  Mutex::Locker l(client_lock);
  return objectcacher->release_all();
}
10285
10286
10287int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10288{
10289 Mutex::Locker l(client_lock);
10290 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10291 << ", " << offset << ", " << count << ")" << dendl;
10292
10293 Fh *f = get_filehandle(fd);
10294 if (!f)
10295 return -EBADF;
10296
10297 // for now
10298 _fsync(f, true);
10299
10300 return 0;
10301}
10302
10303int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10304{
10305 Mutex::Locker l(client_lock);
10306 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10307 << ", " << offset << ", " << count << ")" << dendl;
10308
10309 Fh *f = get_filehandle(fd);
10310 if (!f)
10311 return -EBADF;
10312 Inode *in = f->inode.get();
10313
10314 _fsync(f, true);
10315 if (_release(in))
10316 check_caps(in, 0);
10317 return 0;
10318}
10319
10320
10321// =============================
10322// snaps
10323
// Create a snapshot 'name' of the directory at 'relpath' by creating a
// directory entry inside its virtual .snap dir.
int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_create(in.get(), perm);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _mkdir(snapdir, name, 0, perm);
}
181888fb 10344
7c673cae
FG
// Remove snapshot 'name' of the directory at 'relpath' by removing the
// corresponding entry from its virtual .snap dir.
int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_delete(in.get(), NULL, perms);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _rmdir(snapdir, name, perms);
}
10365
10366// =============================
10367// expose caps
10368
// Return the cap bits currently issued for the inode behind 'fd'.
int Client::get_caps_issued(int fd) {

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return f->inode->caps_issued();
}
10382
// Return the cap bits currently issued for the inode at 'path'
// (symlinks are followed).
int Client::get_caps_issued(const char *path, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath p(path);
  InodeRef in;
  int r = path_walk(p, &in, perms, true);
  if (r < 0)
    return r;
  return in->caps_issued();
}
10397
10398// =========================================
10399// low level
10400
// Return (creating on first use) the virtual .snap directory inode for
// 'diri': same ino with snapid CEPH_SNAPDIR, attributes mirrored from
// the parent directory.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    // remember the parent has a live snapdir so teardown can find it
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10432
10433int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10434 Inode **out, const UserPerm& perms)
10435{
10436 Mutex::Locker lock(client_lock);
31f18b77
FG
10437 vinodeno_t vparent = _get_vino(parent);
10438 ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
7c673cae
FG
10439 tout(cct) << "ll_lookup" << std::endl;
10440 tout(cct) << name << std::endl;
10441
181888fb
FG
10442 if (unmounting)
10443 return -ENOTCONN;
10444
7c673cae
FG
10445 int r = 0;
10446 if (!cct->_conf->fuse_default_permissions) {
10447 r = may_lookup(parent, perms);
10448 if (r < 0)
10449 return r;
10450 }
10451
10452 string dname(name);
10453 InodeRef in;
10454
10455 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10456 if (r < 0) {
10457 attr->st_ino = 0;
10458 goto out;
10459 }
10460
10461 assert(in);
10462 fill_stat(in, attr);
10463 _ll_get(in.get());
10464
10465 out:
31f18b77 10466 ldout(cct, 3) << "ll_lookup " << vparent << " " << name
7c673cae
FG
10467 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10468 tout(cct) << attr->st_ino << std::endl;
10469 *out = in.get();
10470 return r;
10471}
10472
1adf2230
AA
// Look up an inode by bare inode number, then resolve its parent and
// dentry name so the kernel's dcache can be primed.  On success the
// caller holds one ll reference on *inode (the parent reference taken
// internally is dropped before returning).
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r) {
    return r;
  }
  assert(inode != NULL);
  assert(*inode != NULL);

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r && r != -EINVAL) {
    // Unexpected error
    _ll_forget(*inode, 1);
    return r;
  } else if (r == -EINVAL) {
    // EINVAL indicates node without parents (root), drop out now
    // and don't try to look up the non-existent dentry.
    return 0;
  }
  // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
  // is already in cache
  assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // Drop the reference on the parent; the caller only keeps *inode.
  _ll_forget(parent, 1);
  return 0;
}
10517
7c673cae
FG
/**
 * Low-level lookup of `name` under `parent`, filling a ceph_statx with the
 * fields selected by `want`/`flags`.  On success an ll reference is taken on
 * the result (caller must ll_forget/ll_put) and *out is set; on failure stx
 * is zeroed and *out is NULL.  Returns 0 or a negative errno.
 */
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // Skip the client-side permission check when fuse already enforces it.
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin the inode for the caller
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();  // NULL on the error path, since `in` was never assigned
  return r;
}
10558
10559int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10560 unsigned int want, unsigned int flags, const UserPerm& perms)
10561{
10562 Mutex::Locker lock(client_lock);
181888fb
FG
10563
10564 if (unmounting)
10565 return -ENOTCONN;
10566
7c673cae
FG
10567 filepath fp(name, 0);
10568 InodeRef in;
10569 int rc;
10570 unsigned mask = statx_to_mask(flags, want);
10571
10572 ldout(cct, 3) << "ll_walk" << name << dendl;
10573 tout(cct) << "ll_walk" << std::endl;
10574 tout(cct) << name << std::endl;
10575
10576 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10577 if (rc < 0) {
10578 /* zero out mask, just in case... */
10579 stx->stx_mask = 0;
10580 stx->stx_ino = 0;
10581 *out = NULL;
10582 return rc;
10583 } else {
10584 assert(in);
10585 fill_statx(in, mask, stx);
10586 _ll_get(in.get());
10587 *out = in.get();
10588 return 0;
10589 }
10590}
10591
10592void Client::_ll_get(Inode *in)
10593{
10594 if (in->ll_ref == 0) {
10595 in->get();
10596 if (in->is_dir() && !in->dn_set.empty()) {
10597 assert(in->dn_set.size() == 1); // dirs can't be hard-linked
10598 in->get_first_parent()->get(); // pin dentry
10599 }
10600 }
10601 in->ll_get();
10602 ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
10603}
10604
10605int Client::_ll_put(Inode *in, int num)
10606{
10607 in->ll_put(num);
10608 ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
10609 if (in->ll_ref == 0) {
10610 if (in->is_dir() && !in->dn_set.empty()) {
10611 assert(in->dn_set.size() == 1); // dirs can't be hard-linked
10612 in->get_first_parent()->put(); // unpin dentry
10613 }
10614 put_inode(in);
10615 return 0;
10616 } else {
10617 return in->ll_ref;
10618 }
10619}
10620
/**
 * Drop every outstanding low-level reference (used during unmount/teardown).
 * _ll_put may erase the inode from inode_map, so the iterator is advanced
 * before each call; to_be_put holds an extra ref on each inode so nothing is
 * destroyed until the loop finishes and the set unwinds.
 */
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << "_ll_drop_pins" << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    // Advance before _ll_put: the current entry may be invalidated.
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
10638
/**
 * Drop `count` low-level references on `in` (fuse FORGET semantics).
 * Returns true when this was the last ll reference.  Forgets on the root
 * inode and forgets arriving during unmount are ignored (reported as last).
 */
bool Client::_ll_forget(Inode *in, int count)
{
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 8) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // Kernel asked us to drop more references than we hold: warn, then
    // drop everything we do hold.
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10667
1adf2230
AA
// Public forget entry point: takes client_lock, then delegates to
// _ll_forget.  Returns true if the last ll reference was dropped.
bool Client::ll_forget(Inode *in, int count)
{
  Mutex::Locker lock(client_lock);
  return _ll_forget(in, count);
}
10673
7c673cae
FG
// Drop a single ll reference; equivalent to ll_forget(in, 1).
bool Client::ll_put(Inode *in)
{
  /* ll_forget already takes the lock */
  return ll_forget(in, 1);
}
10679
// Return the snapshot id of an inode (CEPH_NOSNAP for a live inode).
snapid_t Client::ll_get_snapid(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->snapid;
}
10685
10686Inode *Client::ll_get_inode(ino_t ino)
10687{
10688 Mutex::Locker lock(client_lock);
181888fb
FG
10689
10690 if (unmounting)
10691 return NULL;
10692
7c673cae
FG
10693 vinodeno_t vino = _map_faked_ino(ino);
10694 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10695 if (p == inode_map.end())
10696 return NULL;
10697 Inode *in = p->second;
10698 _ll_get(in);
10699 return in;
10700}
10701
10702Inode *Client::ll_get_inode(vinodeno_t vino)
10703{
10704 Mutex::Locker lock(client_lock);
181888fb
FG
10705
10706 if (unmounting)
10707 return NULL;
10708
7c673cae
FG
10709 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10710 if (p == inode_map.end())
10711 return NULL;
10712 Inode *in = p->second;
10713 _ll_get(in);
10714 return in;
10715}
10716
/**
 * Shared getattr core for the ll_getattr* entry points.  Snapshot inodes
 * (snapid < CEPH_NOSNAP) are immutable, so their cached attributes are
 * returned without an MDS round trip.
 */
int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << "ll_getattr " << vino << dendl;
  tout(cct) << "ll_getattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  if (vino.snapid < CEPH_NOSNAP)
    return 0;
  else
    return _getattr(in, caps, perms);
}
10730
10731int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10732{
10733 Mutex::Locker lock(client_lock);
10734
181888fb
FG
10735 if (unmounting)
10736 return -ENOTCONN;
10737
7c673cae
FG
10738 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10739
10740 if (res == 0)
10741 fill_stat(in, attr);
10742 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10743 return res;
10744}
10745
/**
 * statx-style low-level getattr.  Only contacts the MDS when the
 * currently-issued caps do not already cover every field requested by
 * `want`/`flags`; otherwise the cached attributes are used directly.
 */
int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
			unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  int res = 0;
  unsigned mask = statx_to_mask(flags, want);

  // Fast path: cached caps already satisfy the requested mask.
  if (mask && !in->caps_issued_mask(mask, true))
    res = _ll_getattr(in, mask, perms);

  if (res == 0)
    fill_statx(in, mask, stx);
  ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
  return res;
}
10765
/**
 * Common setattr path for the ll_setattr* entry points: trace the request,
 * run the client-side permission check (unless fuse enforces permissions),
 * then forward to __setattrx.
 */
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << "ll_setattrx " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << "ll_setattrx" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // NOTE(review): the *_NOW bits are stripped before __setattrx — presumably
  // may_setattr is the only consumer and __setattrx uses the explicit
  // timestamps; confirm against __setattrx.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
10794
10795int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10796 const UserPerm& perms)
10797{
10798 Mutex::Locker lock(client_lock);
181888fb
FG
10799
10800 if (unmounting)
10801 return -ENOTCONN;
10802
7c673cae
FG
10803 InodeRef target(in);
10804 int res = _ll_setattrx(in, stx, mask, perms, &target);
10805 if (res == 0) {
10806 assert(in == target.get());
10807 fill_statx(in, in->caps_issued(), stx);
10808 }
10809
10810 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10811 return res;
10812}
10813
10814int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10815 const UserPerm& perms)
10816{
10817 struct ceph_statx stx;
10818 stat_to_statx(attr, &stx);
10819
10820 Mutex::Locker lock(client_lock);
181888fb
FG
10821
10822 if (unmounting)
10823 return -ENOTCONN;
10824
7c673cae
FG
10825 InodeRef target(in);
10826 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10827 if (res == 0) {
10828 assert(in == target.get());
10829 fill_stat(in, attr);
10830 }
10831
10832 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10833 return res;
10834}
10835
10836
10837// ----------
10838// xattrs
10839
10840int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10841 const UserPerm& perms)
10842{
10843 Mutex::Locker lock(client_lock);
181888fb
FG
10844
10845 if (unmounting)
10846 return -ENOTCONN;
10847
7c673cae
FG
10848 InodeRef in;
10849 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10850 if (r < 0)
10851 return r;
10852 return _getxattr(in, name, value, size, perms);
10853}
10854
10855int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10856 const UserPerm& perms)
10857{
10858 Mutex::Locker lock(client_lock);
181888fb
FG
10859
10860 if (unmounting)
10861 return -ENOTCONN;
10862
7c673cae
FG
10863 InodeRef in;
10864 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10865 if (r < 0)
10866 return r;
10867 return _getxattr(in, name, value, size, perms);
10868}
10869
10870int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10871 const UserPerm& perms)
10872{
10873 Mutex::Locker lock(client_lock);
181888fb
FG
10874
10875 if (unmounting)
10876 return -ENOTCONN;
10877
7c673cae
FG
10878 Fh *f = get_filehandle(fd);
10879 if (!f)
10880 return -EBADF;
10881 return _getxattr(f->inode, name, value, size, perms);
10882}
10883
10884int Client::listxattr(const char *path, char *list, size_t size,
10885 const UserPerm& perms)
10886{
10887 Mutex::Locker lock(client_lock);
181888fb
FG
10888
10889 if (unmounting)
10890 return -ENOTCONN;
10891
7c673cae
FG
10892 InodeRef in;
10893 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10894 if (r < 0)
10895 return r;
10896 return Client::_listxattr(in.get(), list, size, perms);
10897}
10898
10899int Client::llistxattr(const char *path, char *list, size_t size,
10900 const UserPerm& perms)
10901{
10902 Mutex::Locker lock(client_lock);
181888fb
FG
10903
10904 if (unmounting)
10905 return -ENOTCONN;
10906
7c673cae
FG
10907 InodeRef in;
10908 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10909 if (r < 0)
10910 return r;
10911 return Client::_listxattr(in.get(), list, size, perms);
10912}
10913
10914int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10915{
10916 Mutex::Locker lock(client_lock);
181888fb
FG
10917
10918 if (unmounting)
10919 return -ENOTCONN;
10920
7c673cae
FG
10921 Fh *f = get_filehandle(fd);
10922 if (!f)
10923 return -EBADF;
10924 return Client::_listxattr(f->inode.get(), list, size, perms);
10925}
10926
10927int Client::removexattr(const char *path, const char *name,
10928 const UserPerm& perms)
10929{
10930 Mutex::Locker lock(client_lock);
181888fb
FG
10931
10932 if (unmounting)
10933 return -ENOTCONN;
10934
7c673cae
FG
10935 InodeRef in;
10936 int r = Client::path_walk(path, &in, perms, true);
10937 if (r < 0)
10938 return r;
10939 return _removexattr(in, name, perms);
10940}
10941
10942int Client::lremovexattr(const char *path, const char *name,
10943 const UserPerm& perms)
10944{
10945 Mutex::Locker lock(client_lock);
181888fb
FG
10946
10947 if (unmounting)
10948 return -ENOTCONN;
10949
7c673cae
FG
10950 InodeRef in;
10951 int r = Client::path_walk(path, &in, perms, false);
10952 if (r < 0)
10953 return r;
10954 return _removexattr(in, name, perms);
10955}
10956
10957int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10958{
10959 Mutex::Locker lock(client_lock);
181888fb
FG
10960
10961 if (unmounting)
10962 return -ENOTCONN;
10963
7c673cae
FG
10964 Fh *f = get_filehandle(fd);
10965 if (!f)
10966 return -EBADF;
10967 return _removexattr(f->inode, name, perms);
10968}
10969
10970int Client::setxattr(const char *path, const char *name, const void *value,
10971 size_t size, int flags, const UserPerm& perms)
10972{
10973 _setxattr_maybe_wait_for_osdmap(name, value, size);
10974
10975 Mutex::Locker lock(client_lock);
181888fb
FG
10976
10977 if (unmounting)
10978 return -ENOTCONN;
10979
7c673cae
FG
10980 InodeRef in;
10981 int r = Client::path_walk(path, &in, perms, true);
10982 if (r < 0)
10983 return r;
10984 return _setxattr(in, name, value, size, flags, perms);
10985}
10986
10987int Client::lsetxattr(const char *path, const char *name, const void *value,
10988 size_t size, int flags, const UserPerm& perms)
10989{
10990 _setxattr_maybe_wait_for_osdmap(name, value, size);
10991
10992 Mutex::Locker lock(client_lock);
181888fb
FG
10993
10994 if (unmounting)
10995 return -ENOTCONN;
10996
7c673cae
FG
10997 InodeRef in;
10998 int r = Client::path_walk(path, &in, perms, false);
10999 if (r < 0)
11000 return r;
11001 return _setxattr(in, name, value, size, flags, perms);
11002}
11003
11004int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11005 int flags, const UserPerm& perms)
11006{
11007 _setxattr_maybe_wait_for_osdmap(name, value, size);
11008
11009 Mutex::Locker lock(client_lock);
181888fb
FG
11010
11011 if (unmounting)
11012 return -ENOTCONN;
11013
7c673cae
FG
11014 Fh *f = get_filehandle(fd);
11015 if (!f)
11016 return -EBADF;
11017 return _setxattr(f->inode, name, value, size, flags, perms);
11018}
11019
/**
 * Core getxattr: virtual "ceph.*" xattrs are served from callbacks, real
 * xattrs from the (possibly refreshed) xattr map.  Following xattr
 * convention, size == 0 is a probe that returns the needed length without
 * copying, and a too-small buffer yields -ERANGE.
 */
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  int r;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr) {
    r = -ENODATA;

    // Do a force getattr to get the latest quota before returning
    // a value to userspace.
    int flags = 0;
    if (vxattr->flags & VXATTR_RSTAT) {
      flags |= CEPH_STAT_RSTAT;
    }
    r = _getattr(in, flags, perms, true);
    if (r != 0) {
      // Error from getattr!
      return r;
    }

    // call pointer-to-member function
    char buf[256];
    if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
      r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
    } else {
      r = -ENODATA;
    }

    if (size != 0) {
      if (r > (int)size) {
	r = -ERANGE;
      } else if (r > 0) {
	memcpy(value, buf, r);
      }
    }
    goto out;
  }

  // "system.*" xattrs are ACLs; without ACL support they are unsupported.
  if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
    r = -EOPNOTSUPP;
    goto out;
  }

  // Refresh the xattr map from the MDS only if we have never fetched it.
  r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    string n(name);
    r = -ENODATA;
    if (in->xattrs.count(n)) {
      r = in->xattrs[n].length();
      if (r > 0 && size != 0) {
	if (size >= (unsigned)r)
	  memcpy(value, in->xattrs[n].c_str(), r);
	else
	  r = -ERANGE;
      }
    }
  }
 out:
  ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
  return r;
}
11082
11083int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11084 const UserPerm& perms)
11085{
11086 if (cct->_conf->client_permissions) {
11087 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11088 if (r < 0)
11089 return r;
11090 }
11091 return _getxattr(in.get(), name, value, size, perms);
11092}
11093
11094int Client::ll_getxattr(Inode *in, const char *name, void *value,
11095 size_t size, const UserPerm& perms)
11096{
11097 Mutex::Locker lock(client_lock);
11098
181888fb
FG
11099 if (unmounting)
11100 return -ENOTCONN;
11101
7c673cae
FG
11102 vinodeno_t vino = _get_vino(in);
11103
11104 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
11105 tout(cct) << "ll_getxattr" << std::endl;
11106 tout(cct) << vino.ino.val << std::endl;
11107 tout(cct) << name << std::endl;
11108
11109 if (!cct->_conf->fuse_default_permissions) {
11110 int r = xattr_permission(in, name, MAY_READ, perms);
11111 if (r < 0)
11112 return r;
11113 }
11114
11115 return _getxattr(in, name, value, size, perms);
11116}
11117
/**
 * Build the NUL-separated xattr name list (real xattrs plus visible virtual
 * ones).  size == 0 probes for the required length; a too-small buffer
 * yields -ERANGE.  Returns total byte length or negative errno.
 */
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    // First pass: compute the total length (each name + its NUL).
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    // NOTE(review): _vxattrs_name_size is assumed to count only the entries
    // the copy loop below emits (non-hidden, exists_cb-true) — confirm, or
    // the probed length can disagree with the bytes written.
    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    if (size != 0) {
      if (size >= (unsigned)r) {
	// Second pass: copy "name\0name\0..." into the caller's buffer.
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	if (vxattrs) {
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	r = -ERANGE;
    }
  }
  ldout(cct, 8) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11162
/**
 * Low-level listxattr entry point: trace the call, then delegate to
 * _listxattr for the actual name-list construction.
 */
int Client::ll_listxattr(Inode *in, char *names, size_t size,
			 const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
  tout(cct) << "ll_listxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  return _listxattr(in, names, size, perms);
}
11180
/**
 * Issue a SETXATTR MDS request for `in`.  A NULL value means "remove";
 * XATTR_CREATE/XATTR_REPLACE map onto the corresponding CEPH_XATTR flags.
 * Returns the MDS reply code.
 */
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // NOTE(review): when value is NULL this appends (NULL, size); callers
  // appear to clear size alongside value in that case — confirm.
  bufferlist bl;
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
11212
/**
 * Validate and apply a setxattr.  Snapshots are read-only.  Only the
 * namespaces the kernel client supports are accepted; "system.*" is allowed
 * only when POSIX ACLs are enabled, in which case ACL values are validated
 * and may be folded into the file mode instead of being stored.
 */
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// If the ACL is equivalent to a plain mode, drop the xattr (value
	// becomes NULL -> remove) and apply the mode via setattr instead.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    // Virtual "ceph.*" attributes marked readonly cannot be set.
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}
11273
11274int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11275 size_t size, int flags, const UserPerm& perms)
11276{
11277 if (cct->_conf->client_permissions) {
11278 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11279 if (r < 0)
11280 return r;
11281 }
11282 return _setxattr(in.get(), name, value, size, flags, perms);
11283}
11284
/**
 * Validate the data pool referenced by a "layout"/"layout.pool" xattr value
 * against the given osdmap.  Returns 0 when the value names an existing pool
 * (or no pool at all), -EINVAL on parse failure, -ENOENT for an unknown pool.
 */
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    // A full "layout" value is a key=value list; extract the "pool" key.
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    // The pool may be given numerically (id) or by name.
    int64_t pool;
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11324
/**
 * If the xattr being set is a layout that may name a data pool, make sure we
 * have an osdmap that knows the pool before sending the MDS request.  Must
 * be called without client_lock held (it may block on an osdmap fetch).
 */
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    // Unknown pool in our current map: wait for the latest map and retry is
    // left to the MDS request path; here we only refresh the local map.
    if (r == -ENOENT) {
      C_SaferCond ctx;
      objecter->wait_for_latest_osdmap(&ctx);
      ctx.wait();
    }
  }
}
11345
11346int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11347 size_t size, int flags, const UserPerm& perms)
11348{
11349 _setxattr_maybe_wait_for_osdmap(name, value, size);
11350
11351 Mutex::Locker lock(client_lock);
11352
181888fb
FG
11353 if (unmounting)
11354 return -ENOTCONN;
11355
7c673cae
FG
11356 vinodeno_t vino = _get_vino(in);
11357
11358 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
11359 tout(cct) << "ll_setxattr" << std::endl;
11360 tout(cct) << vino.ino.val << std::endl;
11361 tout(cct) << name << std::endl;
11362
11363 if (!cct->_conf->fuse_default_permissions) {
11364 int r = xattr_permission(in, name, MAY_WRITE, perms);
11365 if (r < 0)
11366 return r;
11367 }
11368 return _setxattr(in, name, value, size, flags, perms);
11369}
11370
/**
 * Remove an extended attribute via an RMXATTR MDS request.  Snapshots are
 * read-only, only kernel-supported namespaces are accepted, and readonly
 * virtual attributes cannot be removed.
 */
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
11402
11403int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11404{
11405 if (cct->_conf->client_permissions) {
11406 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11407 if (r < 0)
11408 return r;
11409 }
11410 return _removexattr(in.get(), name, perms);
11411}
11412
11413int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11414{
11415 Mutex::Locker lock(client_lock);
11416
181888fb
FG
11417 if (unmounting)
11418 return -ENOTCONN;
11419
7c673cae
FG
11420 vinodeno_t vino = _get_vino(in);
11421
11422 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11423 tout(cct) << "ll_removexattr" << std::endl;
11424 tout(cct) << vino.ino.val << std::endl;
11425 tout(cct) << name << std::endl;
11426
11427 if (!cct->_conf->fuse_default_permissions) {
11428 int r = xattr_permission(in, name, MAY_WRITE, perms);
11429 if (r < 0)
11430 return r;
11431 }
11432
11433 return _removexattr(in, name, perms);
11434}
11435
// vxattr callbacks for "ceph.quota.*": visible only when a quota is enabled
// on the inode.  All getters follow snprintf semantics (return the would-be
// string length).
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable();
}
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
11455
// The layout vxattrs exist only when the inode's layout differs from the
// default-constructed one.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
// Full "ceph.*.layout" value: stripe parameters plus the pool (by name when
// the osdmap knows it, otherwise by id) and optional pool namespace.
// NOTE(review): if the first snprintf truncates, r exceeds size and the
// subsequent val + r / size - r arithmetic is out of range; callers appear
// to pass a 256-byte buffer (see _getxattr) — confirm that is always large
// enough.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
// Individual layout field getters (snprintf semantics).
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
}
// Pool getter: name when the osdmap knows the pool, numeric id otherwise.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r = snprintf(val, size, "%s", o.get_pool_name(
		       in->layout.pool_id).c_str());
      else
	r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}
// Pool namespace getter (empty string when unset).
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// Directory statistics vxattrs: dirstat covers the immediate children;
// rstat covers the whole subtree (recursive).  All snprintf semantics.
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
}
11536size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11537{
11538 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11539 (long)in->rstat.rctime.nsec());
11540}
11541
// Helpers for building VXattr table entries.  These use the GNU
// designated-initializer-with-colon extension ("name: value"), matching the
// field order of Client::VXattr.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only, listed vxattr backed by _vxattrcb_<type>_<name>.
#define XATTR_NAME_CEPH(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  hidden: false,						\
  exists_cb: NULL,						\
  flags: 0,							\
}
// Same as XATTR_NAME_CEPH but with explicit flags
// (e.g. VXATTR_RSTAT forces a fresh rstat fetch in _getxattr).
#define XATTR_NAME_CEPH2(_type, _name, _flags)			\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  hidden: false,						\
  exists_cb: NULL,						\
  flags: _flags,						\
}
// Writable, hidden per-field layout entry; exists only when a layout is set.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		\
{								\
  name: CEPH_XATTR_NAME2(_type, _name, _field),			\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	\
  readonly: false,						\
  hidden: true,							\
  exists_cb: &Client::_vxattrcb_layout_exists,			\
  flags: 0,							\
}
// Writable, hidden quota field; exists only when a quota is enabled.
#define XATTR_QUOTA_FIELD(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: false,						\
  hidden: true,							\
  exists_cb: &Client::_vxattrcb_quota_exists,			\
  flags: 0,							\
}
11581
11582const Client::VXattr Client::_dir_vxattrs[] = {
11583 {
11584 name: "ceph.dir.layout",
11585 getxattr_cb: &Client::_vxattrcb_layout,
11586 readonly: false,
11587 hidden: true,
11588 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 11589 flags: 0,
7c673cae
FG
11590 },
11591 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11592 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11593 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11594 XATTR_LAYOUT_FIELD(dir, layout, pool),
11595 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11596 XATTR_NAME_CEPH(dir, entries),
11597 XATTR_NAME_CEPH(dir, files),
11598 XATTR_NAME_CEPH(dir, subdirs),
28e407b8
AA
11599 XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
11600 XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
11601 XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
11602 XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
11603 XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
7c673cae
FG
11604 {
11605 name: "ceph.quota",
11606 getxattr_cb: &Client::_vxattrcb_quota,
11607 readonly: false,
11608 hidden: true,
11609 exists_cb: &Client::_vxattrcb_quota_exists,
28e407b8 11610 flags: 0,
7c673cae
FG
11611 },
11612 XATTR_QUOTA_FIELD(quota, max_bytes),
11613 XATTR_QUOTA_FIELD(quota, max_files),
11614 { name: "" } /* Required table terminator */
11615};
11616
11617const Client::VXattr Client::_file_vxattrs[] = {
11618 {
11619 name: "ceph.file.layout",
11620 getxattr_cb: &Client::_vxattrcb_layout,
11621 readonly: false,
11622 hidden: true,
11623 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 11624 flags: 0,
7c673cae
FG
11625 },
11626 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
11627 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
11628 XATTR_LAYOUT_FIELD(file, layout, object_size),
11629 XATTR_LAYOUT_FIELD(file, layout, pool),
11630 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
11631 { name: "" } /* Required table terminator */
11632};
11633
11634const Client::VXattr *Client::_get_vxattrs(Inode *in)
11635{
11636 if (in->is_dir())
11637 return _dir_vxattrs;
11638 else if (in->is_file())
11639 return _file_vxattrs;
11640 return NULL;
11641}
11642
11643const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11644{
11645 if (strncmp(name, "ceph.", 5) == 0) {
11646 const VXattr *vxattr = _get_vxattrs(in);
11647 if (vxattr) {
11648 while (!vxattr->name.empty()) {
11649 if (vxattr->name == name)
11650 return vxattr;
11651 vxattr++;
11652 }
11653 }
11654 }
11655 return NULL;
11656}
11657
11658size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11659{
11660 size_t len = 0;
11661 while (!vxattr->name.empty()) {
11662 if (!vxattr->hidden)
11663 len += vxattr->name.length() + 1;
11664 vxattr++;
11665 }
11666 return len;
11667}
11668
11669int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11670{
11671 Mutex::Locker lock(client_lock);
11672
181888fb
FG
11673 if (unmounting)
11674 return -ENOTCONN;
11675
7c673cae
FG
11676 vinodeno_t vino = _get_vino(in);
11677
11678 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11679 tout(cct) << "ll_readlink" << std::endl;
11680 tout(cct) << vino.ino.val << std::endl;
11681
11682 set<Dentry*>::iterator dn = in->dn_set.begin();
11683 while (dn != in->dn_set.end()) {
11684 touch_dn(*dn);
11685 ++dn;
11686 }
11687
11688 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11689 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11690 return r;
11691}
11692
11693int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
11694 const UserPerm& perms, InodeRef *inp)
11695{
1adf2230 11696 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
11697 << mode << dec << ", " << rdev << ", uid " << perms.uid()
11698 << ", gid " << perms.gid() << ")" << dendl;
11699
11700 if (strlen(name) > NAME_MAX)
11701 return -ENAMETOOLONG;
11702
11703 if (dir->snapid != CEPH_NOSNAP) {
11704 return -EROFS;
11705 }
11706 if (is_quota_files_exceeded(dir, perms)) {
11707 return -EDQUOT;
11708 }
11709
11710 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
11711
11712 filepath path;
11713 dir->make_nosnap_relative_path(path);
11714 path.push_dentry(name);
11715 req->set_filepath(path);
11716 req->set_inode(dir);
11717 req->head.args.mknod.rdev = rdev;
11718 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11719 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11720
11721 bufferlist xattrs_bl;
11722 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11723 if (res < 0)
11724 goto fail;
11725 req->head.args.mknod.mode = mode;
11726 if (xattrs_bl.length() > 0)
11727 req->set_data(xattrs_bl);
11728
11729 Dentry *de;
11730 res = get_or_create(dir, name, &de);
11731 if (res < 0)
11732 goto fail;
11733 req->set_dentry(de);
11734
11735 res = make_request(req, perms, inp);
11736
11737 trim_cache();
11738
1adf2230 11739 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
11740 return res;
11741
11742 fail:
11743 put_request(req);
11744 return res;
11745}
11746
11747int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
11748 dev_t rdev, struct stat *attr, Inode **out,
11749 const UserPerm& perms)
11750{
11751 Mutex::Locker lock(client_lock);
11752
181888fb
FG
11753 if (unmounting)
11754 return -ENOTCONN;
11755
7c673cae
FG
11756 vinodeno_t vparent = _get_vino(parent);
11757
11758 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
11759 tout(cct) << "ll_mknod" << std::endl;
11760 tout(cct) << vparent.ino.val << std::endl;
11761 tout(cct) << name << std::endl;
11762 tout(cct) << mode << std::endl;
11763 tout(cct) << rdev << std::endl;
11764
11765 if (!cct->_conf->fuse_default_permissions) {
11766 int r = may_create(parent, perms);
11767 if (r < 0)
11768 return r;
11769 }
11770
11771 InodeRef in;
11772 int r = _mknod(parent, name, mode, rdev, perms, &in);
11773 if (r == 0) {
11774 fill_stat(in, attr);
11775 _ll_get(in.get());
11776 }
11777 tout(cct) << attr->st_ino << std::endl;
11778 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
11779 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11780 *out = in.get();
11781 return r;
11782}
11783
11784int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
11785 dev_t rdev, Inode **out,
11786 struct ceph_statx *stx, unsigned want, unsigned flags,
11787 const UserPerm& perms)
11788{
11789 unsigned caps = statx_to_mask(flags, want);
11790 Mutex::Locker lock(client_lock);
11791
181888fb
FG
11792 if (unmounting)
11793 return -ENOTCONN;
11794
7c673cae
FG
11795 vinodeno_t vparent = _get_vino(parent);
11796
11797 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
11798 tout(cct) << "ll_mknodx" << std::endl;
11799 tout(cct) << vparent.ino.val << std::endl;
11800 tout(cct) << name << std::endl;
11801 tout(cct) << mode << std::endl;
11802 tout(cct) << rdev << std::endl;
11803
11804 if (!cct->_conf->fuse_default_permissions) {
11805 int r = may_create(parent, perms);
11806 if (r < 0)
11807 return r;
11808 }
11809
11810 InodeRef in;
11811 int r = _mknod(parent, name, mode, rdev, perms, &in);
11812 if (r == 0) {
11813 fill_statx(in, caps, stx);
11814 _ll_get(in.get());
11815 }
11816 tout(cct) << stx->stx_ino << std::endl;
11817 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
11818 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11819 *out = in.get();
11820 return r;
11821}
11822
11823int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
11824 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
11825 int object_size, const char *data_pool, bool *created,
11826 const UserPerm& perms)
11827{
1adf2230 11828 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
7c673cae
FG
11829 mode << dec << ")" << dendl;
11830
11831 if (strlen(name) > NAME_MAX)
11832 return -ENAMETOOLONG;
11833 if (dir->snapid != CEPH_NOSNAP) {
11834 return -EROFS;
11835 }
11836 if (is_quota_files_exceeded(dir, perms)) {
11837 return -EDQUOT;
11838 }
11839
11840 // use normalized flags to generate cmode
11841 int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
11842 if (cmode < 0)
11843 return -EINVAL;
11844
11845 int64_t pool_id = -1;
11846 if (data_pool && *data_pool) {
11847 pool_id = objecter->with_osdmap(
11848 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
11849 if (pool_id < 0)
11850 return -EINVAL;
11851 if (pool_id > 0xffffffffll)
11852 return -ERANGE; // bummer!
11853 }
11854
11855 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
11856
11857 filepath path;
11858 dir->make_nosnap_relative_path(path);
11859 path.push_dentry(name);
11860 req->set_filepath(path);
11861 req->set_inode(dir);
11862 req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);
11863
11864 req->head.args.open.stripe_unit = stripe_unit;
11865 req->head.args.open.stripe_count = stripe_count;
11866 req->head.args.open.object_size = object_size;
11867 if (cct->_conf->client_debug_getattr_caps)
11868 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
11869 else
11870 req->head.args.open.mask = 0;
11871 req->head.args.open.pool = pool_id;
11872 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11873 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11874
11875 mode |= S_IFREG;
11876 bufferlist xattrs_bl;
11877 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11878 if (res < 0)
11879 goto fail;
11880 req->head.args.open.mode = mode;
11881 if (xattrs_bl.length() > 0)
11882 req->set_data(xattrs_bl);
11883
11884 Dentry *de;
11885 res = get_or_create(dir, name, &de);
11886 if (res < 0)
11887 goto fail;
11888 req->set_dentry(de);
11889
11890 res = make_request(req, perms, inp, created);
11891 if (res < 0) {
11892 goto reply_error;
11893 }
11894
11895 /* If the caller passed a value in fhp, do the open */
11896 if(fhp) {
11897 (*inp)->get_open_ref(cmode);
11898 *fhp = _create_fh(inp->get(), flags, cmode, perms);
11899 }
11900
11901 reply_error:
11902 trim_cache();
11903
1adf2230 11904 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
7c673cae
FG
11905 << " layout " << stripe_unit
11906 << ' ' << stripe_count
11907 << ' ' << object_size
11908 <<") = " << res << dendl;
11909 return res;
11910
11911 fail:
11912 put_request(req);
11913 return res;
11914}
11915
11916
11917int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
11918 InodeRef *inp)
11919{
1adf2230 11920 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
11921 << mode << dec << ", uid " << perm.uid()
11922 << ", gid " << perm.gid() << ")" << dendl;
11923
11924 if (strlen(name) > NAME_MAX)
11925 return -ENAMETOOLONG;
11926
11927 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
11928 return -EROFS;
11929 }
11930 if (is_quota_files_exceeded(dir, perm)) {
11931 return -EDQUOT;
11932 }
11933 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
11934 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
11935
11936 filepath path;
11937 dir->make_nosnap_relative_path(path);
11938 path.push_dentry(name);
11939 req->set_filepath(path);
11940 req->set_inode(dir);
11941 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11942 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11943
11944 mode |= S_IFDIR;
11945 bufferlist xattrs_bl;
11946 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
11947 if (res < 0)
11948 goto fail;
11949 req->head.args.mkdir.mode = mode;
11950 if (xattrs_bl.length() > 0)
11951 req->set_data(xattrs_bl);
11952
11953 Dentry *de;
11954 res = get_or_create(dir, name, &de);
11955 if (res < 0)
11956 goto fail;
11957 req->set_dentry(de);
11958
11959 ldout(cct, 10) << "_mkdir: making request" << dendl;
11960 res = make_request(req, perm, inp);
11961 ldout(cct, 10) << "_mkdir result is " << res << dendl;
11962
11963 trim_cache();
11964
1adf2230 11965 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
11966 return res;
11967
11968 fail:
11969 put_request(req);
11970 return res;
11971}
11972
11973int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
11974 struct stat *attr, Inode **out, const UserPerm& perm)
11975{
11976 Mutex::Locker lock(client_lock);
11977
181888fb
FG
11978 if (unmounting)
11979 return -ENOTCONN;
11980
7c673cae
FG
11981 vinodeno_t vparent = _get_vino(parent);
11982
11983 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
11984 tout(cct) << "ll_mkdir" << std::endl;
11985 tout(cct) << vparent.ino.val << std::endl;
11986 tout(cct) << name << std::endl;
11987 tout(cct) << mode << std::endl;
11988
11989 if (!cct->_conf->fuse_default_permissions) {
11990 int r = may_create(parent, perm);
11991 if (r < 0)
11992 return r;
11993 }
11994
11995 InodeRef in;
11996 int r = _mkdir(parent, name, mode, perm, &in);
11997 if (r == 0) {
11998 fill_stat(in, attr);
11999 _ll_get(in.get());
12000 }
12001 tout(cct) << attr->st_ino << std::endl;
12002 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12003 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12004 *out = in.get();
12005 return r;
12006}
12007
12008int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12009 struct ceph_statx *stx, unsigned want, unsigned flags,
12010 const UserPerm& perms)
12011{
12012 Mutex::Locker lock(client_lock);
12013
181888fb
FG
12014 if (unmounting)
12015 return -ENOTCONN;
12016
7c673cae
FG
12017 vinodeno_t vparent = _get_vino(parent);
12018
12019 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12020 tout(cct) << "ll_mkdirx" << std::endl;
12021 tout(cct) << vparent.ino.val << std::endl;
12022 tout(cct) << name << std::endl;
12023 tout(cct) << mode << std::endl;
12024
12025 if (!cct->_conf->fuse_default_permissions) {
12026 int r = may_create(parent, perms);
12027 if (r < 0)
12028 return r;
12029 }
12030
12031 InodeRef in;
12032 int r = _mkdir(parent, name, mode, perms, &in);
12033 if (r == 0) {
12034 fill_statx(in, statx_to_mask(flags, want), stx);
12035 _ll_get(in.get());
12036 } else {
12037 stx->stx_ino = 0;
12038 stx->stx_mask = 0;
12039 }
12040 tout(cct) << stx->stx_ino << std::endl;
12041 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12042 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12043 *out = in.get();
12044 return r;
12045}
12046
12047int Client::_symlink(Inode *dir, const char *name, const char *target,
12048 const UserPerm& perms, InodeRef *inp)
12049{
1adf2230 12050 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
7c673cae
FG
12051 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
12052 << dendl;
12053
12054 if (strlen(name) > NAME_MAX)
12055 return -ENAMETOOLONG;
12056
12057 if (dir->snapid != CEPH_NOSNAP) {
12058 return -EROFS;
12059 }
12060 if (is_quota_files_exceeded(dir, perms)) {
12061 return -EDQUOT;
12062 }
12063
12064 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
12065
12066 filepath path;
12067 dir->make_nosnap_relative_path(path);
12068 path.push_dentry(name);
12069 req->set_filepath(path);
12070 req->set_inode(dir);
12071 req->set_string2(target);
12072 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12073 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12074
12075 Dentry *de;
12076 int res = get_or_create(dir, name, &de);
12077 if (res < 0)
12078 goto fail;
12079 req->set_dentry(de);
12080
12081 res = make_request(req, perms, inp);
12082
12083 trim_cache();
1adf2230 12084 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
7c673cae
FG
12085 res << dendl;
12086 return res;
12087
12088 fail:
12089 put_request(req);
12090 return res;
12091}
12092
12093int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12094 struct stat *attr, Inode **out, const UserPerm& perms)
12095{
12096 Mutex::Locker lock(client_lock);
12097
181888fb
FG
12098 if (unmounting)
12099 return -ENOTCONN;
12100
7c673cae
FG
12101 vinodeno_t vparent = _get_vino(parent);
12102
12103 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12104 << dendl;
12105 tout(cct) << "ll_symlink" << std::endl;
12106 tout(cct) << vparent.ino.val << std::endl;
12107 tout(cct) << name << std::endl;
12108 tout(cct) << value << std::endl;
12109
12110 if (!cct->_conf->fuse_default_permissions) {
12111 int r = may_create(parent, perms);
12112 if (r < 0)
12113 return r;
12114 }
12115
12116 InodeRef in;
12117 int r = _symlink(parent, name, value, perms, &in);
12118 if (r == 0) {
12119 fill_stat(in, attr);
12120 _ll_get(in.get());
12121 }
12122 tout(cct) << attr->st_ino << std::endl;
12123 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12124 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12125 *out = in.get();
12126 return r;
12127}
12128
12129int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12130 Inode **out, struct ceph_statx *stx, unsigned want,
12131 unsigned flags, const UserPerm& perms)
12132{
12133 Mutex::Locker lock(client_lock);
12134
181888fb
FG
12135 if (unmounting)
12136 return -ENOTCONN;
12137
7c673cae
FG
12138 vinodeno_t vparent = _get_vino(parent);
12139
12140 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12141 << dendl;
12142 tout(cct) << "ll_symlinkx" << std::endl;
12143 tout(cct) << vparent.ino.val << std::endl;
12144 tout(cct) << name << std::endl;
12145 tout(cct) << value << std::endl;
12146
12147 if (!cct->_conf->fuse_default_permissions) {
12148 int r = may_create(parent, perms);
12149 if (r < 0)
12150 return r;
12151 }
12152
12153 InodeRef in;
12154 int r = _symlink(parent, name, value, perms, &in);
12155 if (r == 0) {
12156 fill_statx(in, statx_to_mask(flags, want), stx);
12157 _ll_get(in.get());
12158 }
12159 tout(cct) << stx->stx_ino << std::endl;
12160 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12161 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12162 *out = in.get();
12163 return r;
12164}
12165
12166int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12167{
1adf2230 12168 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
7c673cae
FG
12169 << " uid " << perm.uid() << " gid " << perm.gid()
12170 << ")" << dendl;
12171
12172 if (dir->snapid != CEPH_NOSNAP) {
12173 return -EROFS;
12174 }
12175
12176 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12177
12178 filepath path;
12179 dir->make_nosnap_relative_path(path);
12180 path.push_dentry(name);
12181 req->set_filepath(path);
12182
12183 InodeRef otherin;
b32b8144 12184 Inode *in;
7c673cae 12185 Dentry *de;
b32b8144 12186
7c673cae
FG
12187 int res = get_or_create(dir, name, &de);
12188 if (res < 0)
12189 goto fail;
12190 req->set_dentry(de);
12191 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12192 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12193
12194 res = _lookup(dir, name, 0, &otherin, perm);
12195 if (res < 0)
12196 goto fail;
b32b8144
FG
12197
12198 in = otherin.get();
12199 req->set_other_inode(in);
12200 in->break_all_delegs();
7c673cae
FG
12201 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12202
12203 req->set_inode(dir);
12204
12205 res = make_request(req, perm);
12206
12207 trim_cache();
1adf2230 12208 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
7c673cae
FG
12209 return res;
12210
12211 fail:
12212 put_request(req);
12213 return res;
12214}
12215
12216int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12217{
12218 Mutex::Locker lock(client_lock);
12219
181888fb
FG
12220 if (unmounting)
12221 return -ENOTCONN;
12222
7c673cae
FG
12223 vinodeno_t vino = _get_vino(in);
12224
12225 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12226 tout(cct) << "ll_unlink" << std::endl;
12227 tout(cct) << vino.ino.val << std::endl;
12228 tout(cct) << name << std::endl;
12229
12230 if (!cct->_conf->fuse_default_permissions) {
12231 int r = may_delete(in, name, perm);
12232 if (r < 0)
12233 return r;
12234 }
12235 return _unlink(in, name, perm);
12236}
12237
12238int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12239{
1adf2230 12240 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
12241 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12242
12243 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12244 return -EROFS;
12245 }
b32b8144
FG
12246
12247 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12248 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
12249 filepath path;
12250 dir->make_nosnap_relative_path(path);
12251 path.push_dentry(name);
12252 req->set_filepath(path);
12253
12254 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12255 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12256 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12257
12258 InodeRef in;
12259
12260 Dentry *de;
12261 int res = get_or_create(dir, name, &de);
12262 if (res < 0)
12263 goto fail;
b32b8144
FG
12264 if (op == CEPH_MDS_OP_RMDIR)
12265 req->set_dentry(de);
12266 else
12267 de->get();
12268
7c673cae
FG
12269 res = _lookup(dir, name, 0, &in, perms);
12270 if (res < 0)
12271 goto fail;
b32b8144 12272 if (op == CEPH_MDS_OP_RMDIR) {
7c673cae 12273 req->set_inode(dir);
7c673cae
FG
12274 req->set_other_inode(in.get());
12275 } else {
12276 unlink(de, true, true);
b32b8144 12277 de->put();
7c673cae
FG
12278 req->set_other_inode(in.get());
12279 }
12280
12281 res = make_request(req, perms);
12282
12283 trim_cache();
1adf2230 12284 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae
FG
12285 return res;
12286
12287 fail:
12288 put_request(req);
12289 return res;
12290}
12291
12292int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12293{
12294 Mutex::Locker lock(client_lock);
12295
181888fb
FG
12296 if (unmounting)
12297 return -ENOTCONN;
12298
7c673cae
FG
12299 vinodeno_t vino = _get_vino(in);
12300
12301 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12302 tout(cct) << "ll_rmdir" << std::endl;
12303 tout(cct) << vino.ino.val << std::endl;
12304 tout(cct) << name << std::endl;
12305
12306 if (!cct->_conf->fuse_default_permissions) {
12307 int r = may_delete(in, name, perms);
12308 if (r < 0)
12309 return r;
12310 }
12311
12312 return _rmdir(in, name, perms);
12313}
12314
12315int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12316{
1adf2230 12317 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
7c673cae
FG
12318 << todir->ino << " " << toname
12319 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12320 << dendl;
12321
12322 if (fromdir->snapid != todir->snapid)
12323 return -EXDEV;
12324
12325 int op = CEPH_MDS_OP_RENAME;
12326 if (fromdir->snapid != CEPH_NOSNAP) {
12327 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12328 op = CEPH_MDS_OP_RENAMESNAP;
12329 else
12330 return -EROFS;
12331 }
12332 if (fromdir != todir) {
12333 Inode *fromdir_root =
12334 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12335 Inode *todir_root =
12336 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12337 if (fromdir_root != todir_root) {
12338 return -EXDEV;
12339 }
12340 }
12341
12342 InodeRef target;
12343 MetaRequest *req = new MetaRequest(op);
12344
12345 filepath from;
12346 fromdir->make_nosnap_relative_path(from);
12347 from.push_dentry(fromname);
12348 filepath to;
12349 todir->make_nosnap_relative_path(to);
12350 to.push_dentry(toname);
12351 req->set_filepath(to);
12352 req->set_filepath2(from);
12353
12354 Dentry *oldde;
12355 int res = get_or_create(fromdir, fromname, &oldde);
12356 if (res < 0)
12357 goto fail;
12358 Dentry *de;
12359 res = get_or_create(todir, toname, &de);
12360 if (res < 0)
12361 goto fail;
12362
12363 if (op == CEPH_MDS_OP_RENAME) {
12364 req->set_old_dentry(oldde);
12365 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12366 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12367
12368 req->set_dentry(de);
12369 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12370 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12371
12372 InodeRef oldin, otherin;
12373 res = _lookup(fromdir, fromname, 0, &oldin, perm);
12374 if (res < 0)
12375 goto fail;
b32b8144
FG
12376
12377 Inode *oldinode = oldin.get();
12378 oldinode->break_all_delegs();
12379 req->set_old_inode(oldinode);
7c673cae
FG
12380 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12381
12382 res = _lookup(todir, toname, 0, &otherin, perm);
b32b8144
FG
12383 switch (res) {
12384 case 0:
12385 {
12386 Inode *in = otherin.get();
12387 req->set_other_inode(in);
12388 in->break_all_delegs();
12389 }
7c673cae 12390 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
b32b8144
FG
12391 break;
12392 case -ENOENT:
12393 break;
12394 default:
12395 goto fail;
7c673cae
FG
12396 }
12397
12398 req->set_inode(todir);
12399 } else {
12400 // renamesnap reply contains no tracedn, so we need to invalidate
12401 // dentry manually
12402 unlink(oldde, true, true);
12403 unlink(de, true, true);
12404 }
12405
12406 res = make_request(req, perm, &target);
12407 ldout(cct, 10) << "rename result is " << res << dendl;
12408
12409 // renamed item from our cache
12410
12411 trim_cache();
1adf2230 12412 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
7c673cae
FG
12413 return res;
12414
12415 fail:
12416 put_request(req);
12417 return res;
12418}
12419
12420int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12421 const char *newname, const UserPerm& perm)
12422{
12423 Mutex::Locker lock(client_lock);
12424
181888fb
FG
12425 if (unmounting)
12426 return -ENOTCONN;
12427
7c673cae
FG
12428 vinodeno_t vparent = _get_vino(parent);
12429 vinodeno_t vnewparent = _get_vino(newparent);
12430
12431 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12432 << vnewparent << " " << newname << dendl;
12433 tout(cct) << "ll_rename" << std::endl;
12434 tout(cct) << vparent.ino.val << std::endl;
12435 tout(cct) << name << std::endl;
12436 tout(cct) << vnewparent.ino.val << std::endl;
12437 tout(cct) << newname << std::endl;
12438
12439 if (!cct->_conf->fuse_default_permissions) {
12440 int r = may_delete(parent, name, perm);
12441 if (r < 0)
12442 return r;
12443 r = may_delete(newparent, newname, perm);
12444 if (r < 0 && r != -ENOENT)
12445 return r;
12446 }
12447
12448 return _rename(parent, name, newparent, newname, perm);
12449}
12450
12451int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12452{
1adf2230 12453 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
7c673cae
FG
12454 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12455
12456 if (strlen(newname) > NAME_MAX)
12457 return -ENAMETOOLONG;
12458
12459 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12460 return -EROFS;
12461 }
12462 if (is_quota_files_exceeded(dir, perm)) {
12463 return -EDQUOT;
12464 }
12465
b32b8144 12466 in->break_all_delegs();
7c673cae
FG
12467 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12468
12469 filepath path(newname, dir->ino);
12470 req->set_filepath(path);
12471 filepath existing(in->ino);
12472 req->set_filepath2(existing);
12473
12474 req->set_inode(dir);
12475 req->inode_drop = CEPH_CAP_FILE_SHARED;
12476 req->inode_unless = CEPH_CAP_FILE_EXCL;
12477
12478 Dentry *de;
12479 int res = get_or_create(dir, newname, &de);
12480 if (res < 0)
12481 goto fail;
12482 req->set_dentry(de);
12483
12484 res = make_request(req, perm, inp);
12485 ldout(cct, 10) << "link result is " << res << dendl;
12486
12487 trim_cache();
1adf2230 12488 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
7c673cae
FG
12489 return res;
12490
12491 fail:
12492 put_request(req);
12493 return res;
12494}
12495
12496int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12497 const UserPerm& perm)
12498{
12499 Mutex::Locker lock(client_lock);
12500
181888fb
FG
12501 if (unmounting)
12502 return -ENOTCONN;
12503
7c673cae
FG
12504 vinodeno_t vino = _get_vino(in);
12505 vinodeno_t vnewparent = _get_vino(newparent);
12506
31f18b77 12507 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
12508 newname << dendl;
12509 tout(cct) << "ll_link" << std::endl;
12510 tout(cct) << vino.ino.val << std::endl;
12511 tout(cct) << vnewparent << std::endl;
12512 tout(cct) << newname << std::endl;
12513
12514 int r = 0;
12515 InodeRef target;
12516
12517 if (!cct->_conf->fuse_default_permissions) {
12518 if (S_ISDIR(in->mode))
12519 return -EPERM;
12520
12521 r = may_hardlink(in, perm);
12522 if (r < 0)
12523 return r;
12524
12525 r = may_create(newparent, perm);
12526 if (r < 0)
12527 return r;
12528 }
12529
12530 return _link(in, newparent, newname, perm, &target);
12531}
12532
12533int Client::ll_num_osds(void)
12534{
12535 Mutex::Locker lock(client_lock);
12536 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12537}
12538
12539int Client::ll_osdaddr(int osd, uint32_t *addr)
12540{
12541 Mutex::Locker lock(client_lock);
181888fb 12542
7c673cae
FG
12543 entity_addr_t g;
12544 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12545 if (!o.exists(osd))
12546 return false;
12547 g = o.get_addr(osd);
12548 return true;
12549 });
12550 if (!exists)
12551 return -1;
12552 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12553 *addr = ntohl(nb_addr);
12554 return 0;
12555}
181888fb 12556
7c673cae
FG
12557uint32_t Client::ll_stripe_unit(Inode *in)
12558{
12559 Mutex::Locker lock(client_lock);
12560 return in->layout.stripe_unit;
12561}
12562
12563uint64_t Client::ll_snap_seq(Inode *in)
12564{
12565 Mutex::Locker lock(client_lock);
12566 return in->snaprealm->seq;
12567}
12568
12569int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12570{
12571 Mutex::Locker lock(client_lock);
12572 *layout = in->layout;
12573 return 0;
12574}
12575
12576int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12577{
12578 return ll_file_layout(fh->inode.get(), layout);
12579}
12580
12581/* Currently we cannot take advantage of redundancy in reads, since we
12582 would have to go through all possible placement groups (a
12583 potentially quite large number determined by a hash), and use CRUSH
12584 to calculate the appropriate set of OSDs for each placement group,
12585 then index into that. An array with one entry per OSD is much more
12586 tractable and works for demonstration purposes. */
12587
12588int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12589 file_layout_t* layout)
12590{
12591 Mutex::Locker lock(client_lock);
181888fb 12592
28e407b8 12593 inodeno_t ino = in->ino;
7c673cae
FG
12594 uint32_t object_size = layout->object_size;
12595 uint32_t su = layout->stripe_unit;
12596 uint32_t stripe_count = layout->stripe_count;
12597 uint64_t stripes_per_object = object_size / su;
12598
12599 uint64_t stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12600 uint64_t stripepos = blockno % stripe_count; // which object in the object set (X)
12601 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12602 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12603
12604 object_t oid = file_object_t(ino, objectno);
12605 return objecter->with_osdmap([&](const OSDMap& o) {
12606 ceph_object_layout olayout =
12607 o.file_to_object_layout(oid, *layout);
12608 pg_t pg = (pg_t)olayout.ol_pgid;
12609 vector<int> osds;
12610 int primary;
12611 o.pg_to_acting_osds(pg, &osds, &primary);
12612 return primary;
12613 });
12614}
12615
12616/* Return the offset of the block, internal to the object */
12617
12618uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12619{
12620 Mutex::Locker lock(client_lock);
12621 file_layout_t *layout=&(in->layout);
12622 uint32_t object_size = layout->object_size;
12623 uint32_t su = layout->stripe_unit;
12624 uint64_t stripes_per_object = object_size / su;
12625
12626 return (blockno % stripes_per_object) * su;
12627}
12628
12629int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12630 const UserPerm& perms)
12631{
12632 Mutex::Locker lock(client_lock);
12633
181888fb
FG
12634 if (unmounting)
12635 return -ENOTCONN;
12636
7c673cae
FG
12637 vinodeno_t vino = _get_vino(in);
12638
12639 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12640 tout(cct) << "ll_opendir" << std::endl;
12641 tout(cct) << vino.ino.val << std::endl;
12642
12643 if (!cct->_conf->fuse_default_permissions) {
12644 int r = may_open(in, flags, perms);
12645 if (r < 0)
12646 return r;
12647 }
12648
12649 int r = _opendir(in, dirpp, perms);
12650 tout(cct) << (unsigned long)*dirpp << std::endl;
12651
12652 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12653 << dendl;
12654 return r;
12655}
12656
12657int Client::ll_releasedir(dir_result_t *dirp)
12658{
12659 Mutex::Locker lock(client_lock);
12660 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12661 tout(cct) << "ll_releasedir" << std::endl;
12662 tout(cct) << (unsigned long)dirp << std::endl;
181888fb
FG
12663
12664 if (unmounting)
12665 return -ENOTCONN;
12666
7c673cae
FG
12667 _closedir(dirp);
12668 return 0;
12669}
12670
12671int Client::ll_fsyncdir(dir_result_t *dirp)
12672{
12673 Mutex::Locker lock(client_lock);
12674 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12675 tout(cct) << "ll_fsyncdir" << std::endl;
12676 tout(cct) << (unsigned long)dirp << std::endl;
12677
181888fb
FG
12678 if (unmounting)
12679 return -ENOTCONN;
12680
7c673cae
FG
12681 return _fsync(dirp->inode.get(), false);
12682}
12683
12684int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12685{
12686 assert(!(flags & O_CREAT));
12687
12688 Mutex::Locker lock(client_lock);
12689
181888fb
FG
12690 if (unmounting)
12691 return -ENOTCONN;
12692
7c673cae
FG
12693 vinodeno_t vino = _get_vino(in);
12694
12695 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
12696 tout(cct) << "ll_open" << std::endl;
12697 tout(cct) << vino.ino.val << std::endl;
12698 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12699
12700 int r;
12701 if (!cct->_conf->fuse_default_permissions) {
12702 r = may_open(in, flags, perms);
12703 if (r < 0)
12704 goto out;
12705 }
12706
12707 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
12708
12709 out:
12710 Fh *fhptr = fhp ? *fhp : NULL;
12711 if (fhptr) {
12712 ll_unclosed_fh_set.insert(fhptr);
12713 }
12714 tout(cct) << (unsigned long)fhptr << std::endl;
12715 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
12716 " = " << r << " (" << fhptr << ")" << dendl;
12717 return r;
12718}
12719
12720int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
12721 int flags, InodeRef *in, int caps, Fh **fhp,
12722 const UserPerm& perms)
12723{
12724 *fhp = NULL;
12725
12726 vinodeno_t vparent = _get_vino(parent);
12727
1adf2230 12728 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
12729 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
12730 << ", gid " << perms.gid() << dendl;
12731 tout(cct) << "ll_create" << std::endl;
12732 tout(cct) << vparent.ino.val << std::endl;
12733 tout(cct) << name << std::endl;
12734 tout(cct) << mode << std::endl;
12735 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12736
12737 bool created = false;
12738 int r = _lookup(parent, name, caps, in, perms);
12739
12740 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
12741 return -EEXIST;
12742
12743 if (r == -ENOENT && (flags & O_CREAT)) {
12744 if (!cct->_conf->fuse_default_permissions) {
12745 r = may_create(parent, perms);
12746 if (r < 0)
12747 goto out;
12748 }
12749 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
12750 perms);
12751 if (r < 0)
12752 goto out;
12753 }
12754
12755 if (r < 0)
12756 goto out;
12757
12758 assert(*in);
12759
12760 ldout(cct, 20) << "_ll_create created = " << created << dendl;
12761 if (!created) {
12762 if (!cct->_conf->fuse_default_permissions) {
12763 r = may_open(in->get(), flags, perms);
12764 if (r < 0) {
12765 if (*fhp) {
12766 int release_r = _release_fh(*fhp);
12767 assert(release_r == 0); // during create, no async data ops should have happened
12768 }
12769 goto out;
12770 }
12771 }
12772 if (*fhp == NULL) {
12773 r = _open(in->get(), flags, mode, fhp, perms);
12774 if (r < 0)
12775 goto out;
12776 }
12777 }
12778
12779out:
12780 if (*fhp) {
12781 ll_unclosed_fh_set.insert(*fhp);
12782 }
12783
12784 ino_t ino = 0;
12785 if (r >= 0) {
12786 Inode *inode = in->get();
12787 if (use_faked_inos())
12788 ino = inode->faked_ino;
12789 else
12790 ino = inode->ino;
12791 }
12792
12793 tout(cct) << (unsigned long)*fhp << std::endl;
12794 tout(cct) << ino << std::endl;
1adf2230 12795 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
12796 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
12797 *fhp << " " << hex << ino << dec << ")" << dendl;
12798
12799 return r;
12800}
12801
12802int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12803 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12804 const UserPerm& perms)
12805{
12806 Mutex::Locker lock(client_lock);
12807 InodeRef in;
12808
181888fb
FG
12809 if (unmounting)
12810 return -ENOTCONN;
12811
7c673cae
FG
12812 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12813 fhp, perms);
12814 if (r >= 0) {
12815 assert(in);
12816
12817 // passing an Inode in outp requires an additional ref
12818 if (outp) {
12819 _ll_get(in.get());
12820 *outp = in.get();
12821 }
12822 fill_stat(in, attr);
12823 } else {
12824 attr->st_ino = 0;
12825 }
12826
12827 return r;
12828}
12829
12830int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12831 int oflags, Inode **outp, Fh **fhp,
12832 struct ceph_statx *stx, unsigned want, unsigned lflags,
12833 const UserPerm& perms)
12834{
12835 unsigned caps = statx_to_mask(lflags, want);
12836 Mutex::Locker lock(client_lock);
12837 InodeRef in;
12838
181888fb
FG
12839 if (unmounting)
12840 return -ENOTCONN;
7c673cae
FG
12841
12842 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12843 if (r >= 0) {
12844 assert(in);
12845
12846 // passing an Inode in outp requires an additional ref
12847 if (outp) {
12848 _ll_get(in.get());
12849 *outp = in.get();
12850 }
12851 fill_statx(in, caps, stx);
12852 } else {
12853 stx->stx_ino = 0;
12854 stx->stx_mask = 0;
12855 }
12856
12857 return r;
12858}
12859
12860loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12861{
12862 Mutex::Locker lock(client_lock);
12863 tout(cct) << "ll_lseek" << std::endl;
12864 tout(cct) << offset << std::endl;
12865 tout(cct) << whence << std::endl;
12866
181888fb
FG
12867 if (unmounting)
12868 return -ENOTCONN;
12869
7c673cae
FG
12870 return _lseek(fh, offset, whence);
12871}
12872
12873int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12874{
12875 Mutex::Locker lock(client_lock);
12876 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12877 tout(cct) << "ll_read" << std::endl;
12878 tout(cct) << (unsigned long)fh << std::endl;
12879 tout(cct) << off << std::endl;
12880 tout(cct) << len << std::endl;
12881
181888fb
FG
12882 if (unmounting)
12883 return -ENOTCONN;
12884
7c673cae
FG
12885 return _read(fh, off, len, bl);
12886}
12887
12888int Client::ll_read_block(Inode *in, uint64_t blockid,
12889 char *buf,
12890 uint64_t offset,
12891 uint64_t length,
12892 file_layout_t* layout)
12893{
12894 Mutex::Locker lock(client_lock);
181888fb
FG
12895
12896 if (unmounting)
12897 return -ENOTCONN;
12898
b32b8144 12899 vinodeno_t vino = _get_vino(in);
7c673cae
FG
12900 object_t oid = file_object_t(vino.ino, blockid);
12901 C_SaferCond onfinish;
12902 bufferlist bl;
12903
12904 objecter->read(oid,
12905 object_locator_t(layout->pool_id),
12906 offset,
12907 length,
12908 vino.snapid,
12909 &bl,
12910 CEPH_OSD_FLAG_READ,
12911 &onfinish);
12912
12913 client_lock.Unlock();
12914 int r = onfinish.wait();
12915 client_lock.Lock();
12916
12917 if (r >= 0) {
12918 bl.copy(0, bl.length(), buf);
12919 r = bl.length();
12920 }
12921
12922 return r;
12923}
12924
12925/* It appears that the OSD doesn't return success unless the entire
12926 buffer was written, return the write length on success. */
12927
12928int Client::ll_write_block(Inode *in, uint64_t blockid,
12929 char* buf, uint64_t offset,
12930 uint64_t length, file_layout_t* layout,
12931 uint64_t snapseq, uint32_t sync)
12932{
12933 Mutex flock("Client::ll_write_block flock");
12934 vinodeno_t vino = ll_get_vino(in);
12935 Cond cond;
12936 bool done;
12937 int r = 0;
181888fb 12938 Context *onsafe = nullptr;
7c673cae
FG
12939
12940 if (length == 0) {
12941 return -EINVAL;
12942 }
12943 if (true || sync) {
12944 /* if write is stable, the epilogue is waiting on
12945 * flock */
12946 onsafe = new C_SafeCond(&flock, &cond, &done, &r);
12947 done = false;
12948 } else {
12949 /* if write is unstable, we just place a barrier for
12950 * future commits to wait on */
12951 /*onsafe = new C_Block_Sync(this, vino.ino,
12952 barrier_interval(offset, offset + length), &r);
12953 */
12954 done = true;
12955 }
12956 object_t oid = file_object_t(vino.ino, blockid);
12957 SnapContext fakesnap;
12958 bufferptr bp;
12959 if (length > 0) bp = buffer::copy(buf, length);
12960 bufferlist bl;
12961 bl.push_back(bp);
12962
12963 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
12964 << dendl;
12965
12966 fakesnap.seq = snapseq;
12967
12968 /* lock just in time */
12969 client_lock.Lock();
181888fb
FG
12970 if (unmounting) {
12971 client_lock.Unlock();
12972 delete onsafe;
12973 return -ENOTCONN;
12974 }
7c673cae
FG
12975
12976 objecter->write(oid,
12977 object_locator_t(layout->pool_id),
12978 offset,
12979 length,
12980 fakesnap,
12981 bl,
12982 ceph::real_clock::now(),
12983 0,
12984 onsafe);
12985
12986 client_lock.Unlock();
12987 if (!done /* also !sync */) {
12988 flock.Lock();
12989 while (! done)
12990 cond.Wait(flock);
12991 flock.Unlock();
12992 }
12993
12994 if (r < 0) {
12995 return r;
12996 } else {
12997 return length;
12998 }
12999}
13000
13001int Client::ll_commit_blocks(Inode *in,
13002 uint64_t offset,
13003 uint64_t length)
13004{
13005 Mutex::Locker lock(client_lock);
13006 /*
13007 BarrierContext *bctx;
b32b8144 13008 vinodeno_t vino = _get_vino(in);
7c673cae
FG
13009 uint64_t ino = vino.ino;
13010
13011 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13012 << offset << " to " << length << dendl;
13013
13014 if (length == 0) {
13015 return -EINVAL;
13016 }
13017
13018 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13019 if (p != barriers.end()) {
13020 barrier_interval civ(offset, offset + length);
13021 p->second->commit_barrier(civ);
13022 }
13023 */
13024 return 0;
13025}
13026
13027int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13028{
13029 Mutex::Locker lock(client_lock);
13030 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13031 "~" << len << dendl;
13032 tout(cct) << "ll_write" << std::endl;
13033 tout(cct) << (unsigned long)fh << std::endl;
13034 tout(cct) << off << std::endl;
13035 tout(cct) << len << std::endl;
13036
181888fb
FG
13037 if (unmounting)
13038 return -ENOTCONN;
13039
7c673cae
FG
13040 int r = _write(fh, off, len, data, NULL, 0);
13041 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13042 << dendl;
13043 return r;
13044}
13045
13046int Client::ll_flush(Fh *fh)
13047{
13048 Mutex::Locker lock(client_lock);
13049 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13050 tout(cct) << "ll_flush" << std::endl;
13051 tout(cct) << (unsigned long)fh << std::endl;
13052
181888fb
FG
13053 if (unmounting)
13054 return -ENOTCONN;
13055
7c673cae
FG
13056 return _flush(fh);
13057}
13058
13059int Client::ll_fsync(Fh *fh, bool syncdataonly)
13060{
13061 Mutex::Locker lock(client_lock);
13062 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13063 tout(cct) << "ll_fsync" << std::endl;
13064 tout(cct) << (unsigned long)fh << std::endl;
13065
181888fb
FG
13066 if (unmounting)
13067 return -ENOTCONN;
13068
7c673cae
FG
13069 int r = _fsync(fh, syncdataonly);
13070 if (r) {
13071 // If we're returning an error, clear it from the FH
13072 fh->take_async_err();
13073 }
13074 return r;
13075}
13076
28e407b8
AA
13077int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13078{
13079 Mutex::Locker lock(client_lock);
13080 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13081 tout(cct) << "ll_sync_inode" << std::endl;
13082 tout(cct) << (unsigned long)in << std::endl;
13083
13084 if (unmounting)
13085 return -ENOTCONN;
13086
13087 return _fsync(in, syncdataonly);
13088}
13089
7c673cae
FG
13090#ifdef FALLOC_FL_PUNCH_HOLE
13091
13092int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13093{
13094 if (offset < 0 || length <= 0)
13095 return -EINVAL;
13096
13097 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
13098 return -EOPNOTSUPP;
13099
13100 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
13101 return -EOPNOTSUPP;
13102
13103 Inode *in = fh->inode.get();
13104
13105 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
13106 !(mode & FALLOC_FL_PUNCH_HOLE)) {
13107 return -ENOSPC;
13108 }
13109
13110 if (in->snapid != CEPH_NOSNAP)
13111 return -EROFS;
13112
13113 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
13114 return -EBADF;
13115
13116 uint64_t size = offset + length;
28e407b8 13117 std::list<InodeRef> quota_roots;
7c673cae
FG
13118 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
13119 size > in->size &&
28e407b8 13120 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms, &quota_roots)) {
7c673cae
FG
13121 return -EDQUOT;
13122 }
13123
13124 int have;
13125 int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
13126 if (r < 0)
13127 return r;
13128
13129 Mutex uninline_flock("Client::_fallocate_uninline_data flock");
13130 Cond uninline_cond;
13131 bool uninline_done = false;
13132 int uninline_ret = 0;
13133 Context *onuninline = NULL;
13134
13135 if (mode & FALLOC_FL_PUNCH_HOLE) {
13136 if (in->inline_version < CEPH_INLINE_NONE &&
13137 (have & CEPH_CAP_FILE_BUFFER)) {
13138 bufferlist bl;
13139 int len = in->inline_data.length();
13140 if (offset < len) {
13141 if (offset > 0)
13142 in->inline_data.copy(0, offset, bl);
13143 int size = length;
13144 if (offset + size > len)
13145 size = len - offset;
13146 if (size > 0)
13147 bl.append_zero(size);
13148 if (offset + size < len)
13149 in->inline_data.copy(offset + size, len - offset - size, bl);
13150 in->inline_data = bl;
13151 in->inline_version++;
13152 }
91327a77 13153 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13154 in->change_attr++;
28e407b8 13155 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13156 } else {
13157 if (in->inline_version < CEPH_INLINE_NONE) {
13158 onuninline = new C_SafeCond(&uninline_flock,
13159 &uninline_cond,
13160 &uninline_done,
13161 &uninline_ret);
13162 uninline_data(in, onuninline);
13163 }
13164
13165 Mutex flock("Client::_punch_hole flock");
13166 Cond cond;
13167 bool done = false;
13168 Context *onfinish = new C_SafeCond(&flock, &cond, &done);
13169
13170 unsafe_sync_write++;
13171 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
13172
13173 _invalidate_inode_cache(in, offset, length);
13174 filer->zero(in->ino, &in->layout,
13175 in->snaprealm->get_snap_context(),
13176 offset, length,
13177 ceph::real_clock::now(),
13178 0, true, onfinish);
91327a77 13179 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13180 in->change_attr++;
28e407b8 13181 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13182
13183 client_lock.Unlock();
13184 flock.Lock();
13185 while (!done)
13186 cond.Wait(flock);
13187 flock.Unlock();
13188 client_lock.Lock();
13189 _sync_write_commit(in);
13190 }
13191 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
13192 uint64_t size = offset + length;
13193 if (size > in->size) {
13194 in->size = size;
91327a77 13195 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13196 in->change_attr++;
28e407b8 13197 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 13198
28e407b8 13199 if (is_quota_bytes_approaching(in, quota_roots)) {
7c673cae 13200 check_caps(in, CHECK_CAPS_NODELAY);
31f18b77
FG
13201 } else if (is_max_size_approaching(in)) {
13202 check_caps(in, 0);
7c673cae
FG
13203 }
13204 }
13205 }
13206
13207 if (onuninline) {
13208 client_lock.Unlock();
13209 uninline_flock.Lock();
13210 while (!uninline_done)
13211 uninline_cond.Wait(uninline_flock);
13212 uninline_flock.Unlock();
13213 client_lock.Lock();
13214
13215 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
13216 in->inline_data.clear();
13217 in->inline_version = CEPH_INLINE_NONE;
28e407b8 13218 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13219 check_caps(in, 0);
13220 } else
13221 r = uninline_ret;
13222 }
13223
13224 put_cap_ref(in, CEPH_CAP_FILE_WR);
13225 return r;
13226}
13227#else
13228
13229int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13230{
13231 return -EOPNOTSUPP;
13232}
13233
13234#endif
13235
13236
13237int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
13238{
13239 Mutex::Locker lock(client_lock);
13240 ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
13241 tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
13242 tout(cct) << (unsigned long)fh << std::endl;
13243
181888fb
FG
13244 if (unmounting)
13245 return -ENOTCONN;
13246
7c673cae
FG
13247 return _fallocate(fh, mode, offset, length);
13248}
13249
13250int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13251{
13252 Mutex::Locker lock(client_lock);
13253 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
13254
181888fb
FG
13255 if (unmounting)
13256 return -ENOTCONN;
13257
7c673cae
FG
13258 Fh *fh = get_filehandle(fd);
13259 if (!fh)
13260 return -EBADF;
13261#if defined(__linux__) && defined(O_PATH)
13262 if (fh->flags & O_PATH)
13263 return -EBADF;
13264#endif
13265 return _fallocate(fh, mode, offset, length);
13266}
13267
13268int Client::ll_release(Fh *fh)
13269{
13270 Mutex::Locker lock(client_lock);
91327a77
AA
13271
13272 if (unmounting)
13273 return -ENOTCONN;
13274
7c673cae
FG
13275 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
13276 dendl;
13277 tout(cct) << "ll_release (fh)" << std::endl;
13278 tout(cct) << (unsigned long)fh << std::endl;
13279
13280 if (ll_unclosed_fh_set.count(fh))
13281 ll_unclosed_fh_set.erase(fh);
13282 return _release_fh(fh);
13283}
13284
13285int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13286{
13287 Mutex::Locker lock(client_lock);
13288
13289 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13290 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13291
181888fb
FG
13292 if (unmounting)
13293 return -ENOTCONN;
13294
7c673cae
FG
13295 return _getlk(fh, fl, owner);
13296}
13297
13298int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13299{
13300 Mutex::Locker lock(client_lock);
13301
13302 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
13303 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13304
181888fb
FG
13305 if (unmounting)
13306 return -ENOTCONN;
13307
7c673cae
FG
13308 return _setlk(fh, fl, owner, sleep);
13309}
13310
13311int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13312{
13313 Mutex::Locker lock(client_lock);
13314
13315 ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
13316 tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
13317
181888fb
FG
13318 if (unmounting)
13319 return -ENOTCONN;
13320
7c673cae
FG
13321 return _flock(fh, cmd, owner);
13322}
13323
b32b8144
FG
13324int Client::set_deleg_timeout(uint32_t timeout)
13325{
13326 Mutex::Locker lock(client_lock);
13327
13328 /*
13329 * The whole point is to prevent blacklisting so we must time out the
13330 * delegation before the session autoclose timeout kicks in.
13331 */
13332 if (timeout >= mdsmap->get_session_autoclose())
13333 return -EINVAL;
13334
13335 deleg_timeout = timeout;
13336 return 0;
13337}
13338
13339int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13340{
13341 int ret = -EINVAL;
13342
13343 Mutex::Locker lock(client_lock);
13344
13345 if (!mounted)
13346 return -ENOTCONN;
13347
13348 Inode *inode = fh->inode.get();
13349
13350 switch(cmd) {
13351 case CEPH_DELEGATION_NONE:
13352 inode->unset_deleg(fh);
13353 ret = 0;
13354 break;
13355 default:
13356 try {
13357 ret = inode->set_deleg(fh, cmd, cb, priv);
13358 } catch (std::bad_alloc) {
13359 ret = -ENOMEM;
13360 }
13361 break;
13362 }
13363 return ret;
13364}
13365
7c673cae
FG
13366class C_Client_RequestInterrupt : public Context {
13367private:
13368 Client *client;
13369 MetaRequest *req;
13370public:
13371 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13372 req->get();
13373 }
13374 void finish(int r) override {
13375 Mutex::Locker l(client->client_lock);
13376 assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
13377 client->_interrupt_filelock(req);
13378 client->put_request(req);
13379 }
13380};
13381
13382void Client::ll_interrupt(void *d)
13383{
13384 MetaRequest *req = static_cast<MetaRequest*>(d);
13385 ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
13386 tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
13387 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13388}
13389
13390// =========================================
13391// layout
13392
13393// expose file layouts
13394
13395int Client::describe_layout(const char *relpath, file_layout_t *lp,
13396 const UserPerm& perms)
13397{
13398 Mutex::Locker lock(client_lock);
13399
181888fb
FG
13400 if (unmounting)
13401 return -ENOTCONN;
13402
7c673cae
FG
13403 filepath path(relpath);
13404 InodeRef in;
13405 int r = path_walk(path, &in, perms);
13406 if (r < 0)
13407 return r;
13408
13409 *lp = in->layout;
13410
13411 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
13412 return 0;
13413}
13414
13415int Client::fdescribe_layout(int fd, file_layout_t *lp)
13416{
13417 Mutex::Locker lock(client_lock);
13418
181888fb
FG
13419 if (unmounting)
13420 return -ENOTCONN;
13421
7c673cae
FG
13422 Fh *f = get_filehandle(fd);
13423 if (!f)
13424 return -EBADF;
13425 Inode *in = f->inode.get();
13426
13427 *lp = in->layout;
13428
13429 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
13430 return 0;
13431}
13432
d2e6a577
FG
13433int64_t Client::get_default_pool_id()
13434{
13435 Mutex::Locker lock(client_lock);
181888fb
FG
13436
13437 if (unmounting)
13438 return -ENOTCONN;
13439
d2e6a577
FG
13440 /* first data pool is the default */
13441 return mdsmap->get_first_data_pool();
13442}
7c673cae
FG
13443
13444// expose osdmap
13445
13446int64_t Client::get_pool_id(const char *pool_name)
13447{
13448 Mutex::Locker lock(client_lock);
181888fb
FG
13449
13450 if (unmounting)
13451 return -ENOTCONN;
13452
7c673cae
FG
13453 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13454 pool_name);
13455}
13456
13457string Client::get_pool_name(int64_t pool)
13458{
13459 Mutex::Locker lock(client_lock);
181888fb
FG
13460
13461 if (unmounting)
13462 return string();
13463
7c673cae
FG
13464 return objecter->with_osdmap([pool](const OSDMap& o) {
13465 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13466 });
13467}
13468
13469int Client::get_pool_replication(int64_t pool)
13470{
13471 Mutex::Locker lock(client_lock);
181888fb
FG
13472
13473 if (unmounting)
13474 return -ENOTCONN;
13475
7c673cae
FG
13476 return objecter->with_osdmap([pool](const OSDMap& o) {
13477 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13478 });
13479}
13480
13481int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13482{
13483 Mutex::Locker lock(client_lock);
13484
181888fb
FG
13485 if (unmounting)
13486 return -ENOTCONN;
13487
7c673cae
FG
13488 Fh *f = get_filehandle(fd);
13489 if (!f)
13490 return -EBADF;
13491 Inode *in = f->inode.get();
13492
13493 vector<ObjectExtent> extents;
13494 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
13495 assert(extents.size() == 1);
13496
13497 objecter->with_osdmap([&](const OSDMap& o) {
13498 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13499 o.pg_to_acting_osds(pg, osds);
13500 });
13501
13502 if (osds.empty())
13503 return -EINVAL;
13504
13505 /*
13506 * Return the remainder of the extent (stripe unit)
13507 *
13508 * If length = 1 is passed to Striper::file_to_extents we get a single
13509 * extent back, but its length is one so we still need to compute the length
13510 * to the end of the stripe unit.
13511 *
13512 * If length = su then we may get 1 or 2 objects back in the extents vector
13513 * which would have to be examined. Even then, the offsets are local to the
13514 * object, so matching up to the file offset is extra work.
13515 *
13516 * It seems simpler to stick with length = 1 and manually compute the
13517 * remainder.
13518 */
13519 if (len) {
13520 uint64_t su = in->layout.stripe_unit;
13521 *len = su - (off % su);
13522 }
13523
13524 return 0;
13525}
13526
13527int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13528{
13529 Mutex::Locker lock(client_lock);
181888fb
FG
13530
13531 if (unmounting)
13532 return -ENOTCONN;
13533
7c673cae
FG
13534 if (id < 0)
13535 return -EINVAL;
13536 return objecter->with_osdmap([&](const OSDMap& o) {
13537 return o.crush->get_full_location_ordered(id, path);
13538 });
13539}
13540
13541int Client::get_file_stripe_address(int fd, loff_t offset,
13542 vector<entity_addr_t>& address)
13543{
13544 Mutex::Locker lock(client_lock);
13545
181888fb
FG
13546 if (unmounting)
13547 return -ENOTCONN;
13548
7c673cae
FG
13549 Fh *f = get_filehandle(fd);
13550 if (!f)
13551 return -EBADF;
13552 Inode *in = f->inode.get();
13553
13554 // which object?
13555 vector<ObjectExtent> extents;
13556 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13557 in->truncate_size, extents);
13558 assert(extents.size() == 1);
13559
13560 // now we have the object and its 'layout'
13561 return objecter->with_osdmap([&](const OSDMap& o) {
13562 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13563 vector<int> osds;
13564 o.pg_to_acting_osds(pg, osds);
13565 if (osds.empty())
13566 return -EINVAL;
13567 for (unsigned i = 0; i < osds.size(); i++) {
13568 entity_addr_t addr = o.get_addr(osds[i]);
13569 address.push_back(addr);
13570 }
13571 return 0;
13572 });
13573}
13574
13575int Client::get_osd_addr(int osd, entity_addr_t& addr)
13576{
13577 Mutex::Locker lock(client_lock);
181888fb
FG
13578
13579 if (unmounting)
13580 return -ENOTCONN;
13581
7c673cae
FG
13582 return objecter->with_osdmap([&](const OSDMap& o) {
13583 if (!o.exists(osd))
13584 return -ENOENT;
13585
13586 addr = o.get_addr(osd);
13587 return 0;
13588 });
13589}
13590
13591int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13592 loff_t length, loff_t offset)
13593{
13594 Mutex::Locker lock(client_lock);
13595
181888fb
FG
13596 if (unmounting)
13597 return -ENOTCONN;
13598
7c673cae
FG
13599 Fh *f = get_filehandle(fd);
13600 if (!f)
13601 return -EBADF;
13602 Inode *in = f->inode.get();
13603
13604 // map to a list of extents
13605 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13606
13607 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13608 return 0;
13609}
13610
13611
b32b8144 13612/* find an osd with the same ip. -ENXIO if none. */
7c673cae
FG
13613int Client::get_local_osd()
13614{
13615 Mutex::Locker lock(client_lock);
181888fb
FG
13616
13617 if (unmounting)
13618 return -ENOTCONN;
13619
7c673cae
FG
13620 objecter->with_osdmap([this](const OSDMap& o) {
13621 if (o.get_epoch() != local_osd_epoch) {
13622 local_osd = o.find_osd_on_ip(messenger->get_myaddr());
13623 local_osd_epoch = o.get_epoch();
13624 }
13625 });
13626 return local_osd;
13627}
13628
13629
13630
13631
13632
13633
13634// ===============================
13635
13636void Client::ms_handle_connect(Connection *con)
13637{
13638 ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
13639}
13640
13641bool Client::ms_handle_reset(Connection *con)
13642{
13643 ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
13644 return false;
13645}
13646
13647void Client::ms_handle_remote_reset(Connection *con)
13648{
13649 ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
13650 Mutex::Locker l(client_lock);
13651 switch (con->get_peer_type()) {
13652 case CEPH_ENTITY_TYPE_MDS:
13653 {
13654 // kludge to figure out which mds this is; fixme with a Connection* state
13655 mds_rank_t mds = MDS_RANK_NONE;
13656 MetaSession *s = NULL;
13657 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
13658 p != mds_sessions.end();
13659 ++p) {
13660 if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
13661 mds = p->first;
13662 s = p->second;
13663 }
13664 }
13665 if (mds >= 0) {
d2e6a577 13666 assert (s != NULL);
7c673cae
FG
13667 switch (s->state) {
13668 case MetaSession::STATE_CLOSING:
13669 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
13670 _closed_mds_session(s);
13671 break;
13672
13673 case MetaSession::STATE_OPENING:
13674 {
13675 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
13676 list<Context*> waiters;
13677 waiters.swap(s->waiting_for_open);
13678 _closed_mds_session(s);
13679 MetaSession *news = _get_or_open_mds_session(mds);
13680 news->waiting_for_open.swap(waiters);
13681 }
13682 break;
13683
13684 case MetaSession::STATE_OPEN:
13685 {
28e407b8 13686 objecter->maybe_request_map(); /* to check if we are blacklisted */
7c673cae
FG
13687 const md_config_t *conf = cct->_conf;
13688 if (conf->client_reconnect_stale) {
13689 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
13690 _closed_mds_session(s);
13691 } else {
13692 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
13693 s->state = MetaSession::STATE_STALE;
13694 }
13695 }
13696 break;
13697
13698 case MetaSession::STATE_NEW:
13699 case MetaSession::STATE_CLOSED:
13700 default:
13701 break;
13702 }
13703 }
13704 }
13705 break;
13706 }
13707}
13708
13709bool Client::ms_handle_refused(Connection *con)
13710{
13711 ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
13712 return false;
13713}
13714
13715bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
13716{
13717 if (dest_type == CEPH_ENTITY_TYPE_MON)
13718 return true;
13719 *authorizer = monclient->build_authorizer(dest_type);
13720 return true;
13721}
13722
// Walk from 'in' towards the filesystem root and return the nearest
// ancestor *above* 'in' with a quota enabled, or root_ancestor if none
// is found.  May drop into an MDS LOOKUPNAME request when the parent is
// not reliably cached, in which case the walk restarts from 'in'.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    // Found a quota root strictly above the starting inode.
    if (cur != in && cur->quota.is_enable())
      break;

    // Try to find a parent we can trust without asking the MDS: either
    // a dentry with a still-valid lease from a live MDS session, or a
    // dentry whose directory holds CEPH_CAP_FILE_SHARED with a
    // shared_gen matching the dentry's cap_shared_gen.
    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
	Dentry *dn = *p;
	if (dn->lease_mds >= 0 &&
	    dn->lease_ttl > now &&
	    mds_sessions.count(dn->lease_mds)) {
	  parent_in = dn->dir->parent_inode;
	} else {
	  Inode *diri = dn->dir->parent_inode;
	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	      diri->shared_gen == dn->cap_shared_gen) {
	    parent_in = dn->dir->parent_inode;
	  }
	}
	if (parent_in)
	  break;
      }
    } else if (root_parents.count(cur)) {
      // No dentry for this inode; use the parent recorded in root_parents.
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    if (cur == root_ancestor)
      break;

    // deleted inode
    if (cur->nlink == 0) {
      cur = root_ancestor;
      break;
    }

    // No trustworthy cached parent: ask the MDS for this inode's
    // name/parent, then restart the walk (the cache may have changed
    // while the request was in flight).
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
		    << " failed to find parent of " << cur->vino()
		    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
13794
13795/**
13796 * Traverse quota ancestors of the Inode, return true
13797 * if any of them passes the passed function
13798 */
13799bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13800 std::function<bool (const Inode &in)> test)
13801{
13802 while (true) {
13803 assert(in != NULL);
13804 if (test(*in)) {
13805 return true;
13806 }
13807
13808 if (in == root_ancestor) {
13809 // We're done traversing, drop out
13810 return false;
13811 } else {
13812 // Continue up the tree
13813 in = get_quota_root(in, perms);
13814 }
13815 }
13816
13817 return false;
13818}
13819
13820bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13821{
13822 return check_quota_condition(in, perms,
13823 [](const Inode &in) {
13824 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13825 });
13826}
13827
13828bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
28e407b8
AA
13829 const UserPerm& perms,
13830 std::list<InodeRef>* quota_roots)
7c673cae
FG
13831{
13832 return check_quota_condition(in, perms,
28e407b8
AA
13833 [&new_bytes, quota_roots](const Inode &in) {
13834 if (quota_roots)
13835 quota_roots->emplace_back(const_cast<Inode*>(&in));
7c673cae
FG
13836 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13837 > in.quota.max_bytes;
13838 });
13839}
13840
28e407b8 13841bool Client::is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots)
7c673cae 13842{
28e407b8
AA
13843 assert(in->size >= in->reported_size);
13844 const uint64_t size = in->size - in->reported_size;
13845
13846 for (auto& diri : quota_roots) {
13847 if (diri->quota.max_bytes) {
13848 if (diri->rstat.rbytes >= diri->quota.max_bytes)
13849 return true;
13850
13851 uint64_t space = diri->quota.max_bytes - diri->rstat.rbytes;
13852 if ((space >> 4) < size)
13853 return true;
13854 }
13855 }
13856 return false;
7c673cae
FG
13857}
13858
// State/permission bits cached per (pool id, namespace) in pool_perms;
// see Client::check_pool_perm().
enum {
  POOL_CHECKED = 1,   // a permission probe has completed for this key
  POOL_CHECKING = 2,  // a probe is in flight; other callers must wait
  POOL_READ = 4,      // probe showed we may read from the pool
  POOL_WRITE = 8,     // probe showed we may write to the pool
};
13865
// Check (and cache) whether this client may read and/or write the data
// pool+namespace referenced by the inode's layout, by issuing a probe
// read (stat) and a probe write (exclusive create) against the inode's
// first object.  'need' is a CEPH_CAP_FILE_{RD,WR} mask.  Returns 0 if
// the needed access is available, -EPERM if not, -EIO if the probe
// failed indeterminately.  Must be called with client_lock held; the
// lock is dropped while waiting for the probe ops.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is probing the same key, wait
  // for it to finish and re-check.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have already been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the key in-flight so concurrent callers block above.
    pool_perms[perm_key] = POOL_CHECKING;

    // First object of the file: "<ino in hex>.00000000".
    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Probe read permission with a stat.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Probe write permission with an exclusive create (EEXIST is fine).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while both ops are in flight.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    // -ENOENT still proves read access (object simply doesn't exist).
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // -EEXIST still proves write access (exclusive create raced/retried).
    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // Cache the result and wake anyone waiting on the probe.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13968
13969int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13970{
13971 if (acl_type == POSIX_ACL) {
13972 if (in->xattrs.count(ACL_EA_ACCESS)) {
13973 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13974
13975 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13976 }
13977 }
13978 return -EAGAIN;
13979}
13980
13981int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
13982{
13983 if (acl_type == NO_ACL)
13984 return 0;
13985
13986 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
13987 if (r < 0)
13988 goto out;
13989
13990 if (acl_type == POSIX_ACL) {
13991 if (in->xattrs.count(ACL_EA_ACCESS)) {
13992 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13993 bufferptr acl(access_acl.c_str(), access_acl.length());
13994 r = posix_acl_access_chmod(acl, mode);
13995 if (r < 0)
13996 goto out;
13997 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
13998 } else {
13999 r = 0;
14000 }
14001 }
14002out:
14003 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
14004 return r;
14005}
14006
// Compute the ACL xattrs (and possibly an adjusted creation mode) for a
// new inode being created in 'dir', per POSIX default-ACL inheritance.
// On success returns the number of xattrs encoded into xattrs_bl (0 if
// none apply); negative errno on failure.  '*mode' may be modified by
// ACL inheritance or, absent a default ACL, by the umask callback.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks carry no ACLs and skip umask handling entirely.
  if (S_ISLNK(*mode))
    return 0;

  // Refresh the parent's xattrs if we have never fetched them.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // Inherit from the parent's default ACL; this may rewrite *mode.
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// If the inherited ACL is not fully representable by mode bits,
	// the new inode gets its own access ACL.
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories propagate the default ACL to their own children.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask via callback, if any.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14054
14055void Client::set_filer_flags(int flags)
14056{
14057 Mutex::Locker l(client_lock);
14058 assert(flags == 0 ||
14059 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14060 objecter->add_global_op_flags(flags);
14061}
14062
14063void Client::clear_filer_flags(int flags)
14064{
14065 Mutex::Locker l(client_lock);
14066 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14067 objecter->clear_global_op_flag(flags);
14068}
14069
14070/**
14071 * This is included in cap release messages, to cause
14072 * the MDS to wait until this OSD map epoch. It is necessary
14073 * in corner cases where we cancel RADOS ops, so that
14074 * nobody else tries to do IO to the same objects in
14075 * the same epoch as the cancelled ops.
14076 */
14077void Client::set_cap_epoch_barrier(epoch_t e)
14078{
14079 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
14080 cap_epoch_barrier = e;
14081}
14082
14083const char** Client::get_tracked_conf_keys() const
14084{
14085 static const char* keys[] = {
14086 "client_cache_size",
14087 "client_cache_mid",
14088 "client_acl_type",
b32b8144
FG
14089 "client_deleg_timeout",
14090 "client_deleg_break_on_open",
7c673cae
FG
14091 NULL
14092 };
14093 return keys;
14094}
14095
14096void Client::handle_conf_change(const struct md_config_t *conf,
14097 const std::set <std::string> &changed)
14098{
14099 Mutex::Locker lock(client_lock);
14100
181888fb 14101 if (changed.count("client_cache_mid")) {
7c673cae
FG
14102 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14103 }
14104 if (changed.count("client_acl_type")) {
14105 acl_type = NO_ACL;
14106 if (cct->_conf->client_acl_type == "posix_acl")
14107 acl_type = POSIX_ACL;
14108 }
14109}
14110
7c673cae
FG
// boost::intrusive_ptr hook: take a reference on the Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14115
// boost::intrusive_ptr hook: drop a reference, letting the owning
// Client reap the inode when the count hits zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14120
14121mds_rank_t Client::_get_random_up_mds() const
14122{
14123 assert(client_lock.is_locked_by_me());
14124
14125 std::set<mds_rank_t> up;
14126 mdsmap->get_up_mds_set(up);
14127
14128 if (up.empty())
14129 return MDS_RANK_NONE;
14130 std::set<mds_rank_t>::const_iterator p = up.begin();
14131 for (int n = rand() % up.size(); n; n--)
14132 ++p;
14133 return *p;
14134}
14135
14136
// A StandaloneClient constructs and owns its own Objecter (released in
// the destructor), wiring the monclient to the given messenger.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14143
StandaloneClient::~StandaloneClient()
{
  // We own the objecter (allocated in our constructor); free it and
  // null the pointer so base-class teardown cannot touch it.
  delete objecter;
  objecter = nullptr;
}
14149
// Bring up the standalone client's subsystems: timer, object cacher
// and objecter; register as a messenger dispatcher; authenticate with
// the monitors.  Returns 0 on success or the negative error from
// MonClient::init(), after undoing the partial initialization.
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    // (note: client_lock is released before shutting the objecter down)
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
14180
// Shut down the base Client first, then the objecter and monclient
// this standalone instance brought up in init().
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}