]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
update sources to 12.2.10
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
21#include <sys/stat.h>
22#include <sys/param.h>
23#include <fcntl.h>
24#include <sys/file.h>
25#include <sys/utsname.h>
26#include <sys/uio.h>
27
28#include <boost/lexical_cast.hpp>
29#include <boost/fusion/include/std_pair.hpp>
30
31#if defined(__FreeBSD__)
32#define XATTR_CREATE 0x1
33#define XATTR_REPLACE 0x2
34#else
35#include <sys/xattr.h>
36#endif
37
38#if defined(__linux__)
39#include <linux/falloc.h>
40#endif
41
42#include <sys/statvfs.h>
43
44#include "common/config.h"
45#include "common/version.h"
46
47// ceph stuff
48#include "messages/MClientSession.h"
49#include "messages/MClientReconnect.h"
50#include "messages/MClientRequest.h"
51#include "messages/MClientRequestForward.h"
52#include "messages/MClientReply.h"
53#include "messages/MClientCaps.h"
54#include "messages/MClientLease.h"
55#include "messages/MClientSnap.h"
56#include "messages/MCommandReply.h"
57#include "messages/MOSDMap.h"
58#include "messages/MClientQuota.h"
59#include "messages/MClientCapRelease.h"
60#include "messages/MMDSMap.h"
61#include "messages/MFSMap.h"
62#include "messages/MFSMapUser.h"
63
64#include "mon/MonClient.h"
65
66#include "mds/flock.h"
67#include "osd/OSDMap.h"
68#include "osdc/Filer.h"
69
70#include "common/Cond.h"
71#include "common/Mutex.h"
72#include "common/perf_counters.h"
73#include "common/admin_socket.h"
74#include "common/errno.h"
75#include "include/str_list.h"
76
77#define dout_subsys ceph_subsys_client
78
79#include "include/lru.h"
80#include "include/compat.h"
81#include "include/stringify.h"
82
83#include "Client.h"
84#include "Inode.h"
85#include "Dentry.h"
b32b8144 86#include "Delegation.h"
7c673cae
FG
87#include "Dir.h"
88#include "ClientSnapRealm.h"
89#include "Fh.h"
90#include "MetaSession.h"
91#include "MetaRequest.h"
92#include "ObjecterWriteback.h"
93#include "posix_acl.h"
94
95#include "include/assert.h"
96#include "include/stat.h"
97
98#include "include/cephfs/ceph_statx.h"
99
100#if HAVE_GETGROUPLIST
101#include <grp.h>
102#include <pwd.h>
103#include <unistd.h>
104#endif
105
106#undef dout_prefix
107#define dout_prefix *_dout << "client." << whoami << " "
108
109#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
110
111// FreeBSD fails to define this
112#ifndef O_DSYNC
113#define O_DSYNC 0x0
114#endif
115// Darwin fails to define this
116#ifndef O_RSYNC
117#define O_RSYNC 0x0
118#endif
119
120#ifndef O_DIRECT
121#define O_DIRECT 0x0
122#endif
123
124#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
125
126void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
127{
128 Client *client = static_cast<Client*>(p);
129 client->flush_set_callback(oset);
130}
131
132
133// -------------
134
135Client::CommandHook::CommandHook(Client *client) :
136 m_client(client)
137{
138}
139
140bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
141 std::string format, bufferlist& out)
142{
143 Formatter *f = Formatter::create(format);
144 f->open_object_section("result");
145 m_client->client_lock.Lock();
146 if (command == "mds_requests")
147 m_client->dump_mds_requests(f);
148 else if (command == "mds_sessions")
149 m_client->dump_mds_sessions(f);
150 else if (command == "dump_cache")
151 m_client->dump_cache(f);
152 else if (command == "kick_stale_sessions")
153 m_client->_kick_stale_sessions();
154 else if (command == "status")
155 m_client->dump_status(f);
156 else
157 assert(0 == "bad command registered");
158 m_client->client_lock.Unlock();
159 f->close_section();
160 f->flush(out);
161 delete f;
162 return true;
163}
164
165
166// -------------
167
168dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
169 : inode(in), offset(0), next_offset(2),
170 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
171 perms(perms)
172 { }
173
174void Client::_reset_faked_inos()
175{
176 ino_t start = 1024;
177 free_faked_inos.clear();
178 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
179 last_used_faked_ino = 0;
180 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
181}
182
183void Client::_assign_faked_ino(Inode *in)
184{
185 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
186 if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
187 last_used_faked_ino = 0;
188 it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
189 }
190 assert(it != free_faked_inos.end());
191 if (last_used_faked_ino < it.get_start()) {
192 assert(it.get_len() > 0);
193 last_used_faked_ino = it.get_start();
194 } else {
195 ++last_used_faked_ino;
196 assert(it.get_start() + it.get_len() > last_used_faked_ino);
197 }
198 in->faked_ino = last_used_faked_ino;
199 free_faked_inos.erase(in->faked_ino);
200 faked_ino_map[in->faked_ino] = in->vino();
201}
202
203void Client::_release_faked_ino(Inode *in)
204{
205 free_faked_inos.insert(in->faked_ino);
206 faked_ino_map.erase(in->faked_ino);
207}
208
209vinodeno_t Client::_map_faked_ino(ino_t ino)
210{
211 vinodeno_t vino;
212 if (ino == 1)
213 vino = root->vino();
214 else if (faked_ino_map.count(ino))
215 vino = faked_ino_map[ino];
216 else
217 vino = vinodeno_t(0, CEPH_NOSNAP);
218 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
219 return vino;
220}
221
222vinodeno_t Client::map_faked_ino(ino_t ino)
223{
224 Mutex::Locker lock(client_lock);
225 return _map_faked_ino(ino);
226}
227
228// cons/des
229
230Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
231 : Dispatcher(m->cct),
232 m_command_hook(this),
233 timer(m->cct, client_lock),
234 callback_handle(NULL),
235 switch_interrupt_cb(NULL),
236 remount_cb(NULL),
237 ino_invalidate_cb(NULL),
238 dentry_invalidate_cb(NULL),
7c673cae
FG
239 umask_cb(NULL),
240 can_invalidate_dentries(false),
7c673cae
FG
241 async_ino_invalidator(m->cct),
242 async_dentry_invalidator(m->cct),
243 interrupt_finisher(m->cct),
244 remount_finisher(m->cct),
245 objecter_finisher(m->cct),
246 tick_event(NULL),
247 messenger(m), monclient(mc),
248 objecter(objecter_),
249 whoami(mc->get_global_id()), cap_epoch_barrier(0),
250 last_tid(0), oldest_tid(0), last_flush_tid(1),
251 initialized(false),
31f18b77 252 mounted(false), unmounting(false), blacklisted(false),
b32b8144 253 local_osd(-ENXIO), local_osd_epoch(0),
7c673cae 254 unsafe_sync_write(0),
b32b8144
FG
255 client_lock("Client::client_lock"),
256 deleg_timeout(0)
7c673cae
FG
257{
258 _reset_faked_inos();
259 //
260 root = 0;
261
262 num_flushing_caps = 0;
263
264 _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
265 _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);
266
267 user_id = cct->_conf->client_mount_uid;
268 group_id = cct->_conf->client_mount_gid;
269
270 acl_type = NO_ACL;
271 if (cct->_conf->client_acl_type == "posix_acl")
272 acl_type = POSIX_ACL;
273
7c673cae
FG
274 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
275
276 // file handles
277 free_fd_set.insert(10, 1<<30);
278
279 mdsmap.reset(new MDSMap);
280
281 // osd interfaces
282 writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
283 &client_lock));
284 objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
285 client_flush_set_callback, // all commit callback
286 (void*)this,
287 cct->_conf->client_oc_size,
288 cct->_conf->client_oc_max_objects,
289 cct->_conf->client_oc_max_dirty,
290 cct->_conf->client_oc_target_dirty,
291 cct->_conf->client_oc_max_dirty_age,
292 true));
293 objecter_finisher.start();
294 filer.reset(new Filer(objecter, &objecter_finisher));
31f18b77 295 objecter->enable_blacklist_events();
7c673cae
FG
296}
297
298
299Client::~Client()
300{
301 assert(!client_lock.is_locked());
302
31f18b77
FG
303 // It is necessary to hold client_lock, because any inode destruction
304 // may call into ObjectCacher, which asserts that it's lock (which is
305 // client_lock) is held.
306 client_lock.Lock();
7c673cae 307 tear_down_cache();
31f18b77 308 client_lock.Unlock();
7c673cae
FG
309}
310
311void Client::tear_down_cache()
312{
313 // fd's
314 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
315 it != fd_map.end();
316 ++it) {
317 Fh *fh = it->second;
318 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
319 _release_fh(fh);
320 }
321 fd_map.clear();
322
323 while (!opened_dirs.empty()) {
324 dir_result_t *dirp = *opened_dirs.begin();
325 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
326 _closedir(dirp);
327 }
328
329 // caps!
330 // *** FIXME ***
331
332 // empty lru
7c673cae
FG
333 trim_cache();
334 assert(lru.lru_get_size() == 0);
335
336 // close root ino
337 assert(inode_map.size() <= 1 + root_parents.size());
338 if (root && inode_map.size() == 1 + root_parents.size()) {
339 delete root;
340 root = 0;
341 root_ancestor = 0;
342 while (!root_parents.empty())
343 root_parents.erase(root_parents.begin());
344 inode_map.clear();
345 _reset_faked_inos();
346 }
347
348 assert(inode_map.empty());
349}
350
351inodeno_t Client::get_root_ino()
352{
353 Mutex::Locker l(client_lock);
354 if (use_faked_inos())
355 return root->faked_ino;
356 else
357 return root->ino;
358}
359
360Inode *Client::get_root()
361{
362 Mutex::Locker l(client_lock);
363 root->ll_get();
364 return root;
365}
366
367
368// debug crapola
369
370void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
371{
372 filepath path;
373 in->make_long_path(path);
374 ldout(cct, 1) << "dump_inode: "
375 << (disconnected ? "DISCONNECTED ":"")
376 << "inode " << in->ino
377 << " " << path
378 << " ref " << in->get_num_ref()
379 << *in << dendl;
380
381 if (f) {
382 f->open_object_section("inode");
383 f->dump_stream("path") << path;
384 if (disconnected)
385 f->dump_int("disconnected", 1);
386 in->dump(f);
387 f->close_section();
388 }
389
390 did.insert(in);
391 if (in->dir) {
392 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
393 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
394 it != in->dir->dentries.end();
395 ++it) {
396 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
397 if (f) {
398 f->open_object_section("dentry");
399 it->second->dump(f);
400 f->close_section();
401 }
402 if (it->second->inode)
403 dump_inode(f, it->second->inode.get(), did, false);
404 }
405 }
406}
407
408void Client::dump_cache(Formatter *f)
409{
410 set<Inode*> did;
411
412 ldout(cct, 1) << "dump_cache" << dendl;
413
414 if (f)
415 f->open_array_section("cache");
416
417 if (root)
418 dump_inode(f, root, did, true);
419
420 // make a second pass to catch anything disconnected
421 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
422 it != inode_map.end();
423 ++it) {
424 if (did.count(it->second))
425 continue;
426 dump_inode(f, it->second, did, true);
427 }
428
429 if (f)
430 f->close_section();
431}
432
433void Client::dump_status(Formatter *f)
434{
435 assert(client_lock.is_locked_by_me());
436
437 ldout(cct, 1) << __func__ << dendl;
438
439 const epoch_t osd_epoch
440 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
441
442 if (f) {
443 f->open_object_section("metadata");
444 for (const auto& kv : metadata)
445 f->dump_string(kv.first.c_str(), kv.second);
446 f->close_section();
447
448 f->dump_int("dentry_count", lru.lru_get_size());
449 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
450 f->dump_int("id", get_nodeid().v);
1adf2230
AA
451 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
452 f->dump_object("inst", inst);
453 f->dump_stream("inst_str") << inst;
454 f->dump_stream("addr_str") << inst.addr;
7c673cae
FG
455 f->dump_int("inode_count", inode_map.size());
456 f->dump_int("mds_epoch", mdsmap->get_epoch());
457 f->dump_int("osd_epoch", osd_epoch);
458 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
459 }
460}
461
462int Client::init()
463{
464 timer.init();
465 objectcacher->start();
466
467 client_lock.Lock();
468 assert(!initialized);
469
470 messenger->add_dispatcher_tail(this);
471 client_lock.Unlock();
472
473 _finish_init();
474 return 0;
475}
476
477void Client::_finish_init()
478{
479 client_lock.Lock();
480 // logger
481 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
482 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
483 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
484 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
485 logger.reset(plb.create_perf_counters());
486 cct->get_perfcounters_collection()->add(logger.get());
487
488 client_lock.Unlock();
489
490 cct->_conf->add_observer(this);
491
492 AdminSocket* admin_socket = cct->get_admin_socket();
493 int ret = admin_socket->register_command("mds_requests",
494 "mds_requests",
495 &m_command_hook,
496 "show in-progress mds requests");
497 if (ret < 0) {
498 lderr(cct) << "error registering admin socket command: "
499 << cpp_strerror(-ret) << dendl;
500 }
501 ret = admin_socket->register_command("mds_sessions",
502 "mds_sessions",
503 &m_command_hook,
504 "show mds session state");
505 if (ret < 0) {
506 lderr(cct) << "error registering admin socket command: "
507 << cpp_strerror(-ret) << dendl;
508 }
509 ret = admin_socket->register_command("dump_cache",
510 "dump_cache",
511 &m_command_hook,
512 "show in-memory metadata cache contents");
513 if (ret < 0) {
514 lderr(cct) << "error registering admin socket command: "
515 << cpp_strerror(-ret) << dendl;
516 }
517 ret = admin_socket->register_command("kick_stale_sessions",
518 "kick_stale_sessions",
519 &m_command_hook,
520 "kick sessions that were remote reset");
521 if (ret < 0) {
522 lderr(cct) << "error registering admin socket command: "
523 << cpp_strerror(-ret) << dendl;
524 }
525 ret = admin_socket->register_command("status",
526 "status",
527 &m_command_hook,
528 "show overall client status");
529 if (ret < 0) {
530 lderr(cct) << "error registering admin socket command: "
531 << cpp_strerror(-ret) << dendl;
532 }
533
534 client_lock.Lock();
535 initialized = true;
536 client_lock.Unlock();
537}
538
539void Client::shutdown()
540{
541 ldout(cct, 1) << "shutdown" << dendl;
542
543 // If we were not mounted, but were being used for sending
544 // MDS commands, we may have sessions that need closing.
545 client_lock.Lock();
546 _close_sessions();
547 client_lock.Unlock();
548
549 cct->_conf->remove_observer(this);
550
551 AdminSocket* admin_socket = cct->get_admin_socket();
552 admin_socket->unregister_command("mds_requests");
553 admin_socket->unregister_command("mds_sessions");
554 admin_socket->unregister_command("dump_cache");
555 admin_socket->unregister_command("kick_stale_sessions");
556 admin_socket->unregister_command("status");
557
558 if (ino_invalidate_cb) {
559 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
560 async_ino_invalidator.wait_for_empty();
561 async_ino_invalidator.stop();
562 }
563
564 if (dentry_invalidate_cb) {
565 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
566 async_dentry_invalidator.wait_for_empty();
567 async_dentry_invalidator.stop();
568 }
569
570 if (switch_interrupt_cb) {
571 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
572 interrupt_finisher.wait_for_empty();
573 interrupt_finisher.stop();
574 }
575
576 if (remount_cb) {
577 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
578 remount_finisher.wait_for_empty();
579 remount_finisher.stop();
580 }
581
582 objectcacher->stop(); // outside of client_lock! this does a join.
583
584 client_lock.Lock();
585 assert(initialized);
586 initialized = false;
587 timer.shutdown();
588 client_lock.Unlock();
589
590 objecter_finisher.wait_for_empty();
591 objecter_finisher.stop();
592
593 if (logger) {
594 cct->get_perfcounters_collection()->remove(logger.get());
595 logger.reset();
596 }
597}
598
599
600// ===================
601// metadata cache stuff
602
603void Client::trim_cache(bool trim_kernel_dcache)
604{
181888fb
FG
605 uint64_t max = cct->_conf->client_cache_size;
606 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
7c673cae
FG
607 unsigned last = 0;
608 while (lru.lru_get_size() != last) {
609 last = lru.lru_get_size();
610
181888fb 611 if (!unmounting && lru.lru_get_size() <= max) break;
7c673cae
FG
612
613 // trim!
31f18b77 614 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
615 if (!dn)
616 break; // done
617
618 trim_dentry(dn);
619 }
620
181888fb 621 if (trim_kernel_dcache && lru.lru_get_size() > max)
7c673cae
FG
622 _invalidate_kernel_dcache();
623
624 // hose root?
625 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
626 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
627 delete root;
628 root = 0;
629 root_ancestor = 0;
630 while (!root_parents.empty())
631 root_parents.erase(root_parents.begin());
632 inode_map.clear();
633 _reset_faked_inos();
634 }
635}
636
637void Client::trim_cache_for_reconnect(MetaSession *s)
638{
639 mds_rank_t mds = s->mds_num;
640 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
641
642 int trimmed = 0;
643 list<Dentry*> skipped;
644 while (lru.lru_get_size() > 0) {
645 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
646 if (!dn)
647 break;
648
649 if ((dn->inode && dn->inode->caps.count(mds)) ||
650 dn->dir->parent_inode->caps.count(mds)) {
651 trim_dentry(dn);
652 trimmed++;
653 } else
654 skipped.push_back(dn);
655 }
656
657 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
658 lru.lru_insert_mid(*p);
659
660 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
661 << " trimmed " << trimmed << " dentries" << dendl;
662
663 if (s->caps.size() > 0)
664 _invalidate_kernel_dcache();
665}
666
667void Client::trim_dentry(Dentry *dn)
668{
669 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
670 << " in dir " << hex << dn->dir->parent_inode->ino
671 << dendl;
672 if (dn->inode) {
673 Inode *diri = dn->dir->parent_inode;
674 diri->dir_release_count++;
675 clear_dir_complete_and_ordered(diri, true);
676 }
677 unlink(dn, false, false); // drop dir, drop dentry
678}
679
680
1adf2230
AA
681void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
682 uint64_t truncate_seq, uint64_t truncate_size)
7c673cae 683{
7c673cae
FG
684 uint64_t prior_size = in->size;
685
7c673cae
FG
686 if (truncate_seq > in->truncate_seq ||
687 (truncate_seq == in->truncate_seq && size > in->size)) {
688 ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
689 in->size = size;
690 in->reported_size = size;
691 if (truncate_seq != in->truncate_seq) {
692 ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
693 << truncate_seq << dendl;
694 in->truncate_seq = truncate_seq;
695 in->oset.truncate_seq = truncate_seq;
696
697 // truncate cached file data
698 if (prior_size > size) {
699 _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
700 }
701 }
702
703 // truncate inline data
704 if (in->inline_version < CEPH_INLINE_NONE) {
705 uint32_t len = in->inline_data.length();
706 if (size < len)
707 in->inline_data.splice(size, len - size);
708 }
709 }
710 if (truncate_seq >= in->truncate_seq &&
711 in->truncate_size != truncate_size) {
712 if (in->is_file()) {
713 ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
714 << truncate_size << dendl;
715 in->truncate_size = truncate_size;
716 in->oset.truncate_size = truncate_size;
717 } else {
718 ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
719 }
720 }
1adf2230
AA
721}
722
723void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
724 utime_t ctime, utime_t mtime, utime_t atime)
725{
726 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
727 << " ctime " << ctime << " mtime " << mtime << dendl;
728
729 if (time_warp_seq > in->time_warp_seq)
730 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
731 << " is higher than local time_warp_seq "
732 << in->time_warp_seq << dendl;
733
734 int warn = false;
7c673cae
FG
735 // be careful with size, mtime, atime
736 if (issued & (CEPH_CAP_FILE_EXCL|
737 CEPH_CAP_FILE_WR|
738 CEPH_CAP_FILE_BUFFER|
739 CEPH_CAP_AUTH_EXCL|
740 CEPH_CAP_XATTR_EXCL)) {
741 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
742 if (ctime > in->ctime)
743 in->ctime = ctime;
744 if (time_warp_seq > in->time_warp_seq) {
7c673cae
FG
745 //the mds updated times, so take those!
746 in->mtime = mtime;
747 in->atime = atime;
748 in->time_warp_seq = time_warp_seq;
749 } else if (time_warp_seq == in->time_warp_seq) {
750 //take max times
751 if (mtime > in->mtime)
752 in->mtime = mtime;
753 if (atime > in->atime)
754 in->atime = atime;
755 } else if (issued & CEPH_CAP_FILE_EXCL) {
756 //ignore mds values as we have a higher seq
757 } else warn = true;
758 } else {
759 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
760 if (time_warp_seq >= in->time_warp_seq) {
761 in->ctime = ctime;
762 in->mtime = mtime;
763 in->atime = atime;
764 in->time_warp_seq = time_warp_seq;
765 } else warn = true;
766 }
767 if (warn) {
768 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
769 << time_warp_seq << " is lower than local time_warp_seq "
770 << in->time_warp_seq
771 << dendl;
772 }
773}
774
775void Client::_fragmap_remove_non_leaves(Inode *in)
776{
777 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
778 if (!in->dirfragtree.is_leaf(p->first))
779 in->fragmap.erase(p++);
780 else
781 ++p;
782}
783
784void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
785{
786 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
787 if (p->second == mds)
788 in->fragmap.erase(p++);
789 else
790 ++p;
791}
792
793Inode * Client::add_update_inode(InodeStat *st, utime_t from,
794 MetaSession *session,
795 const UserPerm& request_perms)
796{
797 Inode *in;
798 bool was_new = false;
799 if (inode_map.count(st->vino)) {
800 in = inode_map[st->vino];
801 ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
802 } else {
803 in = new Inode(this, st->vino, &st->layout);
804 inode_map[st->vino] = in;
805
806 if (use_faked_inos())
807 _assign_faked_ino(in);
808
809 if (!root) {
810 root = in;
811 root_ancestor = in;
812 cwd = root;
813 } else if (!mounted) {
814 root_parents[root_ancestor] = in;
815 root_ancestor = in;
816 }
817
818 // immutable bits
819 in->ino = st->vino.ino;
820 in->snapid = st->vino.snapid;
821 in->mode = st->mode & S_IFMT;
822 was_new = true;
823 }
824
825 in->rdev = st->rdev;
826 if (in->is_symlink())
827 in->symlink = st->symlink;
828
7c673cae 829 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1adf2230
AA
830 bool new_version = false;
831 if (in->version == 0 ||
832 ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
833 (in->version & ~1) < st->version))
834 new_version = true;
7c673cae 835
1adf2230
AA
836 int issued;
837 in->caps_issued(&issued);
838 issued |= in->caps_dirty();
839 int new_issued = ~issued & (int)st->cap.caps;
7c673cae 840
1adf2230
AA
841 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
842 !(issued & CEPH_CAP_AUTH_EXCL)) {
843 in->mode = st->mode;
844 in->uid = st->uid;
845 in->gid = st->gid;
846 in->btime = st->btime;
847 }
7c673cae 848
1adf2230
AA
849 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
850 !(issued & CEPH_CAP_LINK_EXCL)) {
851 in->nlink = st->nlink;
852 }
7c673cae 853
1adf2230
AA
854 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
855 update_inode_file_time(in, issued, st->time_warp_seq,
856 st->ctime, st->mtime, st->atime);
857 }
7c673cae 858
1adf2230
AA
859 if (new_version ||
860 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7c673cae 861 in->layout = st->layout;
1adf2230
AA
862 update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
863 }
7c673cae 864
1adf2230
AA
865 if (in->is_dir()) {
866 if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
867 in->dirstat = st->dirstat;
868 }
869 // dir_layout/rstat/quota are not tracked by capability, update them only if
870 // the inode stat is from auth mds
871 if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
7c673cae
FG
872 in->dir_layout = st->dir_layout;
873 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
1adf2230
AA
874 in->rstat = st->rstat;
875 in->quota = st->quota;
876 }
877 // move me if/when version reflects fragtree changes.
878 if (in->dirfragtree != st->dirfragtree) {
879 in->dirfragtree = st->dirfragtree;
880 _fragmap_remove_non_leaves(in);
7c673cae 881 }
7c673cae
FG
882 }
883
884 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
885 st->xattrbl.length() &&
886 st->xattr_version > in->xattr_version) {
887 bufferlist::iterator p = st->xattrbl.begin();
888 ::decode(in->xattrs, p);
889 in->xattr_version = st->xattr_version;
890 }
891
1adf2230
AA
892 if (st->inline_version > in->inline_version) {
893 in->inline_data = st->inline_data;
894 in->inline_version = st->inline_version;
7c673cae
FG
895 }
896
1adf2230
AA
897 /* always take a newer change attr */
898 if (st->change_attr > in->change_attr)
899 in->change_attr = st->change_attr;
900
901 if (st->version > in->version)
902 in->version = st->version;
903
904 if (was_new)
905 ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
906
907 if (!st->cap.caps)
908 return in; // as with readdir returning indoes in different snaprealms (no caps!)
909
7c673cae
FG
910 if (in->snapid == CEPH_NOSNAP) {
911 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
912 st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
913 request_perms);
28e407b8 914 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 915 in->max_size = st->max_size;
28e407b8
AA
916 in->rstat = st->rstat;
917 }
7c673cae 918
1adf2230
AA
919 // setting I_COMPLETE needs to happen after adding the cap
920 if (in->is_dir() &&
921 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
922 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
923 in->dirstat.nfiles == 0 &&
924 in->dirstat.nsubdirs == 0) {
925 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
926 in->flags |= I_COMPLETE | I_DIR_ORDERED;
927 if (in->dir) {
928 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
929 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
930 in->dir->readdir_cache.clear();
931 for (const auto& p : in->dir->dentries) {
932 unlink(p.second, true, true); // keep dir, keep dentry
933 }
934 if (in->dir->dentries.empty())
935 close_dir(in->dir);
7c673cae 936 }
7c673cae 937 }
1adf2230
AA
938 } else {
939 in->snap_caps |= st->cap.caps;
7c673cae
FG
940 }
941
942 return in;
943}
944
945
946/*
947 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
948 */
949Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
950 Inode *in, utime_t from, MetaSession *session,
951 Dentry *old_dentry)
952{
953 Dentry *dn = NULL;
954 if (dir->dentries.count(dname))
955 dn = dir->dentries[dname];
956
957 ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
958 << " in dir " << dir->parent_inode->vino() << " dn " << dn
959 << dendl;
960
961 if (dn && dn->inode) {
962 if (dn->inode->vino() == in->vino()) {
963 touch_dn(dn);
964 ldout(cct, 12) << " had dentry " << dname
965 << " with correct vino " << dn->inode->vino()
966 << dendl;
967 } else {
968 ldout(cct, 12) << " had dentry " << dname
969 << " with WRONG vino " << dn->inode->vino()
970 << dendl;
971 unlink(dn, true, true); // keep dir, keep dentry
972 }
973 }
974
975 if (!dn || !dn->inode) {
976 InodeRef tmp_ref(in);
977 if (old_dentry) {
978 if (old_dentry->dir != dir) {
979 Inode *old_diri = old_dentry->dir->parent_inode;
980 old_diri->dir_ordered_count++;
981 clear_dir_complete_and_ordered(old_diri, false);
982 }
983 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
984 }
985 Inode *diri = dir->parent_inode;
986 diri->dir_ordered_count++;
987 clear_dir_complete_and_ordered(diri, false);
988 dn = link(dir, dname, in, dn);
989 }
990
991 update_dentry_lease(dn, dlease, from, session);
992 return dn;
993}
994
995void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
996{
997 utime_t dttl = from;
998 dttl += (float)dlease->duration_ms / 1000.0;
999
1000 assert(dn);
1001
1002 if (dlease->mask & CEPH_LOCK_DN) {
1003 if (dttl > dn->lease_ttl) {
1004 ldout(cct, 10) << "got dentry lease on " << dn->name
1005 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1006 dn->lease_ttl = dttl;
1007 dn->lease_mds = session->mds_num;
1008 dn->lease_seq = dlease->seq;
1009 dn->lease_gen = session->cap_gen;
1010 }
1011 }
1012 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1013}
1014
1015
1016/*
1017 * update MDS location cache for a single inode
1018 */
1019void Client::update_dir_dist(Inode *in, DirStat *dst)
1020{
1021 // auth
1022 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1023 if (dst->auth >= 0) {
1024 in->fragmap[dst->frag] = dst->auth;
1025 } else {
1026 in->fragmap.erase(dst->frag);
1027 }
1028 if (!in->dirfragtree.is_leaf(dst->frag)) {
1029 in->dirfragtree.force_to_leaf(cct, dst->frag);
1030 _fragmap_remove_non_leaves(in);
1031 }
1032
1033 // replicated
1034 in->dir_replicated = !dst->dist.empty(); // FIXME that's just one frag!
1035
1036 // dist
1037 /*
1038 if (!st->dirfrag_dist.empty()) { // FIXME
1039 set<int> dist = st->dirfrag_dist.begin()->second;
1040 if (dist.empty() && !in->dir_contacts.empty())
1041 ldout(cct, 9) << "lost dist spec for " << in->ino
1042 << " " << dist << dendl;
1043 if (!dist.empty() && in->dir_contacts.empty())
1044 ldout(cct, 9) << "got dist spec for " << in->ino
1045 << " " << dist << dendl;
1046 in->dir_contacts = dist;
1047 }
1048 */
1049}
1050
1051void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1052{
1053 if (diri->flags & I_COMPLETE) {
1054 if (complete) {
1055 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1056 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1057 } else {
1058 if (diri->flags & I_DIR_ORDERED) {
1059 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1060 diri->flags &= ~I_DIR_ORDERED;
1061 }
1062 }
1063 if (diri->dir)
1064 diri->dir->readdir_cache.clear();
1065 }
1066}
1067
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the dentry+inode records from the reply's extra bufferlist,
 * links them into diri's Dir, assigns readdir offsets, and (when this
 * pass started from the beginning of the directory) populates the
 * shared readdir cache used to satisfy later readdirs locally.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  MClientReply *reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();

  dir_result_t *dirp = request->dirp;
  assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  bufferlist::iterator p = reply->get_extra_bl().begin();
  if (!p.end()) {
    // snapdir?  lssnap results live under the synthetic snapdir inode
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    assert(dir);

    // dirstat
    DirStat dst(p);
    __u32 numdn;
    __u16 flags;
    ::decode(numdn, p);
    ::decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real dentry slot (lower offsets are
    // presumably reserved for "."/".." -- consistent with the resets
    // to 2 below)
    assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may have split/merged frags since we sent the request
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // starting from the very beginning of the (leftmost) frag?  then
    // this pass is eligible to (re)populate the shared readdir cache;
    // snapshot the invalidation counters so we notice concurrent changes
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      ::decode(dname, p);
      ::decode(dlease, p);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	// a new hash value restarts the per-hash offset counter
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache -- but only if none of the counters we
      // snapshotted above have moved (i.e. the dir wasn't invalidated
      // while we were filling it)
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  // overwriting an existing slot is only legal while the dir
	  // is not yet known complete+ordered
	  if (dirp->inode->is_complete_and_ordered())
	    assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  assert(0 == "unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1223
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * Decodes the optional dentry and target-inode records from the
 * reply's trace bufferlist and applies them to the client cache.
 * Handles the special cases of traceless replies (invalidate what we
 * may have changed), snap lookups, and readdir/lssnap payloads.
 *
 * @param request the originating MetaRequest
 * @param session the MetaSession the reply arrived on
 * @return the target inode named by the trace, or NULL if there was
 *         no trace (or we already processed an unsafe reply)
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already updated our cache; the safe
    // reply carries no trace
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: the MDS told us nothing about the result, so
    // conservatively invalidate the cached state this op may have changed
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // decode parent-dir stat + dentry info if present
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: ops that asked for xattrs must get them back
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	  assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // negative result: drop any stale dentry->inode link, and keep a
      // (null) dentry around only if the MDS gave us a lease for it
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];
    
    string dname = request->path.last_dentry();
    
    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1393
1394// -------
1395
/*
 * Pick the mds rank to send a request to.
 *
 * Preference order: an explicitly requested resend_mds; the mds
 * authoritative for the dirfrag hash of the path/dentry being operated
 * on; an mds we hold caps from on the relevant inode; finally a random
 * up mds.  If the choice came from a dirfrag hash, *phash_diri is set
 * to the directory inode whose fragmap supplied it (so the caller can
 * prune a stopped mds from that fragmap).
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  // declared up front: the gotos below must not jump over initializations
  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  // an explicit resend target (e.g. after a forward) wins outright
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // snapped inodes carry no useful placement info themselves; walk
      // up to the nearest non-snap ancestor instead
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
        else if (!in->dn_set.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }
  
    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;
  
    // hashed lookup in a dir we know the frag->mds map for?
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
	goto out;
      }
    }
  
    // fall back to whichever mds we hold caps from (auth cap preferred
    // when the op needs the auth mds)
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;
  
    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1499
1500
1501void Client::connect_mds_targets(mds_rank_t mds)
1502{
1503 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1504 assert(mds_sessions.count(mds));
1505 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1506 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1507 q != info.export_targets.end();
1508 ++q) {
1509 if (mds_sessions.count(*q) == 0 &&
1510 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1511 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1512 << " export target mds." << *q << dendl;
1513 _open_mds_session(*q);
1514 }
1515 }
1516}
1517
1518void Client::dump_mds_sessions(Formatter *f)
1519{
1520 f->dump_int("id", get_nodeid().v);
1adf2230
AA
1521 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
1522 f->dump_object("inst", inst);
1523 f->dump_stream("inst_str") << inst;
1524 f->dump_stream("addr_str") << inst.addr;
7c673cae
FG
1525 f->open_array_section("sessions");
1526 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1527 f->open_object_section("session");
1528 p->second->dump(f);
1529 f->close_section();
1530 }
1531 f->close_section();
1532 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1533}
1534void Client::dump_mds_requests(Formatter *f)
1535{
1536 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1537 p != mds_requests.end();
1538 ++p) {
1539 f->open_object_section("request");
1540 p->second->dump(f);
1541 f->close_section();
1542 }
1543}
1544
/*
 * After a successful request, make sure we actually have the target
 * inode in hand, even when the MDS sent a traceless reply.
 *
 * Sets *pcreated from the created-ino record in the reply's extra
 * bufferlist, and fills *ptarget either from the decoded trace, from
 * the inode_map (if we already know the created ino), or by issuing a
 * follow-up lookup/getattr.
 *
 * Returns r (possibly downgraded to the follow-up's error, or -EINTR
 * if the created ino no longer matches what we find).
 */
int Client::verify_reply_trace(int r,
			       MetaRequest *request, MClientReply *reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // insert_trace already resolved the target for us
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name.  someday, do this by the
      // ino... which we know!  FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  assert(0 == "how did this happen?  i want logs!");
	}
      } else {
	// no dentry to look up by; re-stat the inode we operated on
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  // someone replaced our newly created file before we could stat it
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1617
1618
/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * Must be called with client_lock held; blocks (dropping the lock in
 * Cond::Wait) until a reply arrives or the request is aborted.
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // filelock ops can block indefinitely, so they don't hold back oldest_tid
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, ensure a session, send, wait; loop again
  // on forward/kick until we have a reply or the request is aborted
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the rank we computed no longer exists; drop the stale fragmap
	// hint (if any) or retry with a random mds
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
	   request->resend_mds < 0 && // forward
	   !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply) 
      break;
  }

  if (!request->reply) {
    // only an abort can get us out of the loop without a reply
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request); // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;
  
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
1785
1786void Client::unregister_request(MetaRequest *req)
1787{
1788 mds_requests.erase(req->tid);
1789 if (req->tid == oldest_tid) {
1790 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1791 while (true) {
1792 if (p == mds_requests.end()) {
1793 oldest_tid = 0;
1794 break;
1795 }
1796 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1797 oldest_tid = p->first;
1798 break;
1799 }
1800 ++p;
1801 }
1802 }
1803 put_request(req);
1804}
1805
1806void Client::put_request(MetaRequest *request)
1807{
1808 if (request->_put()) {
1809 int op = -1;
1810 if (request->success)
1811 op = request->get_op();
1812 InodeRef other_in;
1813 request->take_other_inode(&other_in);
1814 delete request;
1815
1816 if (other_in &&
1817 (op == CEPH_MDS_OP_RMDIR ||
1818 op == CEPH_MDS_OP_RENAME ||
1819 op == CEPH_MDS_OP_RMSNAP)) {
1820 _try_to_trim_inode(other_in.get(), false);
1821 }
1822 }
1823}
1824
1825int Client::encode_inode_release(Inode *in, MetaRequest *req,
1826 mds_rank_t mds, int drop,
1827 int unless, int force)
1828{
1829 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1830 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1831 << ", have:" << ", force:" << force << ")" << dendl;
1832 int released = 0;
1833 if (in->caps.count(mds)) {
1834 Cap *caps = in->caps[mds];
1835 drop &= ~(in->dirty_caps | get_caps_used(in));
1836 if ((drop & caps->issued) &&
1837 !(unless & caps->issued)) {
1838 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1839 caps->issued &= ~drop;
1840 caps->implemented &= ~drop;
1841 released = 1;
1842 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1843 } else {
1844 released = force;
1845 }
1846 if (released) {
1847 ceph_mds_request_release rel;
1848 rel.ino = in->ino;
1849 rel.cap_id = caps->cap_id;
1850 rel.seq = caps->seq;
1851 rel.issue_seq = caps->issue_seq;
1852 rel.mseq = caps->mseq;
1853 rel.caps = caps->implemented;
1854 rel.wanted = caps->wanted;
1855 rel.dname_len = 0;
1856 rel.dname_seq = 0;
1857 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1858 }
1859 }
1860 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1861 << released << dendl;
1862 return released;
1863}
1864
1865void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1866 mds_rank_t mds, int drop, int unless)
1867{
1868 ldout(cct, 20) << "encode_dentry_release enter(dn:"
1869 << dn << ")" << dendl;
1870 int released = 0;
1871 if (dn->dir)
1872 released = encode_inode_release(dn->dir->parent_inode, req,
1873 mds, drop, unless, 1);
1874 if (released && dn->lease_mds == mds) {
1875 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1876 MClientRequest::Release& rel = req->cap_releases.back();
1877 rel.item.dname_len = dn->name.length();
1878 rel.item.dname_seq = dn->lease_seq;
1879 rel.dname = dn->name;
1880 }
1881 ldout(cct, 25) << "encode_dentry_release exit(dn:"
1882 << dn << ")" << dendl;
1883}
1884
1885
1886/*
1887 * This requires the MClientRequest *request member to be set.
1888 * It will error out horribly without one.
1889 * Additionally, if you set any *drop member, you'd better have
1890 * set the corresponding dentry!
1891 */
1892void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1893{
1894 ldout(cct, 20) << "encode_cap_releases enter (req: "
1895 << req << ", mds: " << mds << ")" << dendl;
1896 if (req->inode_drop && req->inode())
1897 encode_inode_release(req->inode(), req,
1898 mds, req->inode_drop,
1899 req->inode_unless);
1900
1901 if (req->old_inode_drop && req->old_inode())
1902 encode_inode_release(req->old_inode(), req,
1903 mds, req->old_inode_drop,
1904 req->old_inode_unless);
1905 if (req->other_inode_drop && req->other_inode())
1906 encode_inode_release(req->other_inode(), req,
1907 mds, req->other_inode_drop,
1908 req->other_inode_unless);
1909
1910 if (req->dentry_drop && req->dentry())
1911 encode_dentry_release(req->dentry(), req,
1912 mds, req->dentry_drop,
1913 req->dentry_unless);
1914
1915 if (req->old_dentry_drop && req->old_dentry())
1916 encode_dentry_release(req->old_dentry(), req,
1917 mds, req->old_dentry_drop,
1918 req->old_dentry_unless);
1919 ldout(cct, 25) << "encode_cap_releases exit (req: "
1920 << req << ", mds " << mds <<dendl;
1921}
1922
1923bool Client::have_open_session(mds_rank_t mds)
1924{
1925 return
1926 mds_sessions.count(mds) &&
1927 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1928 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1929}
1930
1931MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1932{
1933 if (mds_sessions.count(mds) == 0)
1934 return NULL;
1935 MetaSession *s = mds_sessions[mds];
1936 if (s->con != con)
1937 return NULL;
1938 return s;
1939}
1940
1941MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1942{
1943 if (mds_sessions.count(mds))
1944 return mds_sessions[mds];
1945 return _open_mds_session(mds);
1946}
1947
1948/**
1949 * Populate a map of strings with client-identifying metadata,
1950 * such as the hostname. Call this once at initialization.
1951 */
1952void Client::populate_metadata(const std::string &mount_root)
1953{
1954 // Hostname
1955 struct utsname u;
1956 int r = uname(&u);
1957 if (r >= 0) {
1958 metadata["hostname"] = u.nodename;
1959 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1960 } else {
1961 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1962 }
1963
1964 metadata["pid"] = stringify(getpid());
1965
1966 // Ceph entity id (the '0' in "client.0")
1967 metadata["entity_id"] = cct->_conf->name.get_id();
1968
1969 // Our mount position
1970 if (!mount_root.empty()) {
1971 metadata["root"] = mount_root;
1972 }
1973
1974 // Ceph version
1975 metadata["ceph_version"] = pretty_version_to_str();
1976 metadata["ceph_sha1"] = git_version_to_str();
1977
1978 // Apply any metadata from the user's configured overrides
1979 std::vector<std::string> tokens;
1980 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1981 for (const auto &i : tokens) {
1982 auto eqpos = i.find("=");
1983 // Throw out anything that isn't of the form "<str>=<str>"
1984 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1985 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1986 continue;
1987 }
1988 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1989 }
1990}
1991
1992/**
1993 * Optionally add or override client metadata fields.
1994 */
1995void Client::update_metadata(std::string const &k, std::string const &v)
1996{
1997 Mutex::Locker l(client_lock);
1998 assert(initialized);
1999
2000 if (metadata.count(k)) {
2001 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2002 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
2003 }
2004
2005 metadata[k] = v;
2006}
2007
2008MetaSession *Client::_open_mds_session(mds_rank_t mds)
2009{
2010 ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
2011 assert(mds_sessions.count(mds) == 0);
2012 MetaSession *session = new MetaSession;
2013 session->mds_num = mds;
2014 session->seq = 0;
2015 session->inst = mdsmap->get_inst(mds);
2016 session->con = messenger->get_connection(session->inst);
2017 session->state = MetaSession::STATE_OPENING;
2018 session->mds_state = MDSMap::STATE_NULL;
2019 mds_sessions[mds] = session;
2020
2021 // Maybe skip sending a request to open if this MDS daemon
2022 // has previously sent us a REJECT.
2023 if (rejected_by_mds.count(mds)) {
2024 if (rejected_by_mds[mds] == session->inst) {
2025 ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
2026 "because we were rejected" << dendl;
2027 return session;
2028 } else {
2029 ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
2030 "rejected us, trying with new inst" << dendl;
2031 rejected_by_mds.erase(mds);
2032 }
2033 }
2034
2035 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
2036 m->client_meta = metadata;
2037 session->con->send_message(m);
2038 return session;
2039}
2040
2041void Client::_close_mds_session(MetaSession *s)
2042{
2043 ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
2044 s->state = MetaSession::STATE_CLOSING;
2045 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2046}
2047
2048void Client::_closed_mds_session(MetaSession *s)
2049{
2050 s->state = MetaSession::STATE_CLOSED;
2051 s->con->mark_down();
2052 signal_context_list(s->waiting_for_open);
2053 mount_cond.Signal();
2054 remove_session_caps(s);
2055 kick_requests_closed(s);
2056 mds_sessions.erase(s->mds_num);
2057 delete s;
2058}
2059
2060void Client::handle_client_session(MClientSession *m)
2061{
2062 mds_rank_t from = mds_rank_t(m->get_source().num());
2063 ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;
2064
2065 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2066 if (!session) {
2067 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2068 m->put();
2069 return;
2070 }
2071
2072 switch (m->get_op()) {
2073 case CEPH_SESSION_OPEN:
2074 renew_caps(session);
2075 session->state = MetaSession::STATE_OPEN;
2076 if (unmounting)
2077 mount_cond.Signal();
2078 else
2079 connect_mds_targets(from);
2080 signal_context_list(session->waiting_for_open);
2081 break;
2082
2083 case CEPH_SESSION_CLOSE:
2084 _closed_mds_session(session);
2085 break;
2086
2087 case CEPH_SESSION_RENEWCAPS:
2088 if (session->cap_renew_seq == m->get_seq()) {
2089 session->cap_ttl =
2090 session->last_cap_renew_request + mdsmap->get_session_timeout();
2091 wake_inode_waiters(session);
2092 }
2093 break;
2094
2095 case CEPH_SESSION_STALE:
28e407b8
AA
2096 // invalidate session caps/leases
2097 session->cap_gen++;
2098 session->cap_ttl = ceph_clock_now();
2099 session->cap_ttl -= 1;
7c673cae
FG
2100 renew_caps(session);
2101 break;
2102
2103 case CEPH_SESSION_RECALL_STATE:
2104 trim_caps(session, m->get_max_caps());
2105 break;
2106
2107 case CEPH_SESSION_FLUSHMSG:
2108 session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2109 break;
2110
2111 case CEPH_SESSION_FORCE_RO:
2112 force_session_readonly(session);
2113 break;
2114
2115 case CEPH_SESSION_REJECT:
2116 rejected_by_mds[session->mds_num] = session->inst;
2117 _closed_mds_session(session);
2118
2119 break;
2120
2121 default:
2122 ceph_abort();
2123 }
2124
2125 m->put();
2126}
2127
2128bool Client::_any_stale_sessions() const
2129{
2130 assert(client_lock.is_locked_by_me());
2131
2132 for (const auto &i : mds_sessions) {
2133 if (i.second->state == MetaSession::STATE_STALE) {
2134 return true;
2135 }
2136 }
2137
2138 return false;
2139}
2140
2141void Client::_kick_stale_sessions()
2142{
2143 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2144
2145 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2146 p != mds_sessions.end(); ) {
2147 MetaSession *s = p->second;
2148 ++p;
2149 if (s->state == MetaSession::STATE_STALE)
2150 _closed_mds_session(s);
2151 }
2152}
2153
// Build a fresh MClientRequest from the MetaRequest state and send it
// on the given session.  Replayed (already-unsafe) requests are marked
// as such and carry the target ino; fresh requests get cap releases
// attached (unless drop_cap_releases — used before cap reconnect).
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // we already got an unsafe reply once: this is a replay.
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't sent cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may reference pool names; pin the osdmap epoch we used.
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  // Record when the request was first sent (mds == -1 means never sent).
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // Remember the cap migration seq at send time so ESTALE handling can
  // tell whether caps moved since (see handle_client_reply).
  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2198
// Translate our in-memory MetaRequest into the wire-format
// MClientRequest.  Fills in tid/stamp/head, lazily computes the
// filepath(s) from the inode or dentry if not already set, attaches
// payload data, bumps the retry counter, and copies the caller's
// supplementary group list.
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// dentry has no inode yet (e.g. a create): path is the parent
	// dir's path plus the dentry name.
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // retry_attempt is incremented on every (re)build of this request.
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2236
2237
2238
2239void Client::handle_client_request_forward(MClientRequestForward *fwd)
2240{
2241 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2242 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2243 if (!session) {
2244 fwd->put();
2245 return;
2246 }
2247 ceph_tid_t tid = fwd->get_tid();
2248
2249 if (mds_requests.count(tid) == 0) {
2250 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2251 fwd->put();
2252 return;
2253 }
2254
2255 MetaRequest *request = mds_requests[tid];
2256 assert(request);
2257
2258 // reset retry counter
2259 request->retry_attempt = 0;
2260
2261 // request not forwarded, or dest mds has no session.
2262 // resend.
2263 ldout(cct, 10) << "handle_client_request tid " << tid
2264 << " fwd " << fwd->get_num_fwd()
2265 << " to mds." << fwd->get_dest_mds()
2266 << ", resending to " << fwd->get_dest_mds()
2267 << dendl;
2268
2269 request->mds = -1;
2270 request->item.remove_myself();
2271 request->num_fwd = fwd->get_num_fwd();
2272 request->resend_mds = fwd->get_dest_mds();
2273 request->caller_cond->Signal();
2274
2275 fwd->put();
2276}
2277
2278bool Client::is_dir_operation(MetaRequest *req)
2279{
2280 int op = req->get_op();
2281 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2282 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2283 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2284 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2285 return true;
2286 return false;
2287}
2288
// Handle an MDS reply (unsafe = applied in memory, safe = committed).
// Attaches the reply to the pending MetaRequest, records unsafe state,
// then hands control back to the caller thread (make_request) via a
// signal/kickback handshake; on the safe reply the request is
// unregistered.  Always consumes the message.
void Client::handle_client_reply(MClientReply *reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // Only give up (return ESTALE) if we would resend to the same MDS
    // and nothing changed since we sent (same cap migration seq, or no
    // caps there at all); otherwise wake the caller to retry elsewhere.
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      request->caller_cond->Signal();
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }
  
  assert(request->reply == NULL);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wait for the caller to acknowledge (it clears dispatch_cond);
    // this keeps the reply valid while the caller consumes it.
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
2391
2392void Client::_handle_full_flag(int64_t pool)
2393{
2394 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2395 << "on " << pool << dendl;
2396 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2397 // to do this rather than blocking, because otherwise when we fill up we
2398 // potentially lock caps forever on files with dirty pages, and we need
2399 // to be able to release those caps to the MDS so that it can delete files
2400 // and free up space.
2401 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2402
2403 // For all inodes with layouts in this pool and a pending flush write op
2404 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2405 // from ObjectCacher so that it doesn't re-issue the write in response to
2406 // the ENOSPC error.
2407 // Fortunately since we're cancelling everything in a given pool, we don't
2408 // need to know which ops belong to which ObjectSet, we can just blow all
2409 // the un-flushed cached data away and mark any dirty inodes' async_err
2410 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2411 // affecting this pool, and all the objectsets we're purging were also
2412 // in this pool.
2413 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2414 i != inode_map.end(); ++i)
2415 {
2416 Inode *inode = i->second;
2417 if (inode->oset.dirty_or_tx
2418 && (pool == -1 || inode->layout.pool_id == pool)) {
2419 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2420 << " has dirty objects, purging and setting ENOSPC" << dendl;
2421 objectcacher->purge_set(&inode->oset);
2422 inode->set_async_err(-ENOSPC);
2423 }
2424 }
2425
2426 if (cancelled_epoch != (epoch_t)-1) {
2427 set_cap_epoch_barrier(cancelled_epoch);
2428 }
2429}
2430
2431void Client::handle_osd_map(MOSDMap *m)
2432{
31f18b77
FG
2433 std::set<entity_addr_t> new_blacklists;
2434 objecter->consume_blacklist_events(&new_blacklists);
2435
2436 const auto myaddr = messenger->get_myaddr();
2437 if (!blacklisted && new_blacklists.count(myaddr)) {
2438 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2439 return o.get_epoch();
2440 });
2441 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2442 blacklisted = true;
2443 for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2444 p != mds_requests.end(); ) {
2445 auto req = p->second;
2446 ++p;
2447 req->abort(-EBLACKLISTED);
2448 if (req->caller_cond) {
2449 req->kick = true;
2450 req->caller_cond->Signal();
2451 }
2452 }
2453
2454 // Progress aborts on any requests that were on this waitlist. Any
2455 // requests that were on a waiting_for_open session waitlist
2456 // will get kicked during close session below.
2457 signal_cond_list(waiting_for_mdsmap);
2458
2459 // Force-close all sessions: assume this is not abandoning any state
2460 // on the MDS side because the MDS will have seen the blacklist too.
2461 while(!mds_sessions.empty()) {
2462 auto i = mds_sessions.begin();
2463 auto session = i->second;
2464 _closed_mds_session(session);
2465 }
2466
2467 // Since we know all our OSD ops will fail, cancel them all preemtively,
2468 // so that on an unhealthy cluster we can umount promptly even if e.g.
2469 // some PGs were inaccessible.
2470 objecter->op_cancel_writes(-EBLACKLISTED);
2471
2472 } else if (blacklisted) {
2473 // Handle case where we were blacklisted but no longer are
2474 blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
2475 return o.is_blacklisted(myaddr);});
2476 }
2477
7c673cae
FG
2478 if (objecter->osdmap_full_flag()) {
2479 _handle_full_flag(-1);
2480 } else {
2481 // Accumulate local list of full pools so that I can drop
2482 // the objecter lock before re-entering objecter in
2483 // cancel_writes
2484 std::vector<int64_t> full_pools;
2485
2486 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2487 for (const auto& kv : o.get_pools()) {
2488 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2489 full_pools.push_back(kv.first);
2490 }
2491 }
2492 });
2493
2494 for (auto p : full_pools)
2495 _handle_full_flag(p);
2496
2497 // Subscribe to subsequent maps to watch for the full flag going
2498 // away. For the global full flag objecter does this for us, but
2499 // it pays no attention to the per-pool full flag so in this branch
2500 // we do it ourselves.
2501 if (!full_pools.empty()) {
2502 objecter->maybe_request_map();
2503 }
2504 }
2505
2506 m->put();
2507}
2508
2509
2510// ------------------------
2511// incoming messages
2512
2513
// Messenger entry point: route an incoming message to the matching
// handler under client_lock.  Returns false for message types we do
// not handle (so the messenger can offer them elsewhere).  After each
// message, if an unmount is in progress, run a cache-trim pass and
// poke unmount() when the cache shrank.
bool Client::ms_dispatch(Message *m)
{
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    // Only MDS command replies are ours; others belong to someone else.
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
              << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2591
// Install a new FSMap, wake waiters blocked on it, and acknowledge the
// subscription epoch to the monitor.
void Client::handle_fs_map(MFSMap *m)
{
  fsmap.reset(new FSMap(m->get_fsmap()));
  m->put();

  signal_cond_list(waiting_for_fsmap);

  monclient->sub_got("fsmap", fsmap->get_epoch());
}
2601
// Install a new user-visible FSMap (the compact variant), acknowledge
// the subscription epoch, and wake waiters blocked on the fsmap.
void Client::handle_fs_map_user(MFSMapUser *m)
{
  fsmap_user.reset(new FSMapUser);
  *fsmap_user = m->get_fsmap();
  m->put();

  monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
  signal_cond_list(waiting_for_fsmap);
}
2611
// Process a new MDSMap epoch: cancel commands aimed at vanished/laggy
// MDS GIDs, then walk our sessions and react to each rank's state
// transition (down, instance change, reconnect, newly active, gone).
// Older/duplicate epochs are ignored.  Always consumes m.
void Client::handle_mds_map(MMDSMap* m)
{
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    m->put();
    return;
  }  

  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;

  // keep the previous map around so we can compare per-rank states.
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // erase cancelled entries outside the iteration above.
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = p->second;
    // advance first: _closed_mds_session() below erases the entry.
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_inst(mds) != session->inst) {
      // same rank, different daemon instance: reconnect to the new one.
      session->con->mark_down();
      session->inst = mdsmap->get_inst(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change
    
    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->get_connection(session->inst);
      send_reconnect(session);
    } else if (newstate >= MDSMap::STATE_ACTIVE) {
      if (oldstate < MDSMap::STATE_ACTIVE) {
	// kick new requests
	kick_requests(session);
	kick_flushing_caps(session);
	signal_context_list(session->waiting_for_open);
	kick_maxsize_requests(session);
	wake_inode_waiters(session);
      }
      connect_mds_targets(mds);
    } else if (newstate == MDSMap::STATE_NULL &&
	       mds >= mdsmap->get_max_mds()) {
      // rank no longer exists in the map: drop the session entirely.
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  m->put();

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2703
// Send an MClientReconnect to an MDS entering RECONNECT: re-advertise
// every cap we hold from that rank (with seqs reset), the snaprealms
// they belong to, and file locks, after trimming the cache and
// resending unsafe requests so the MDS can replay them.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // drop any batched cap-release message; those releases are moot now.
  if (session->release) {
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
	       << " " << ccap_string(in->caps[mds]->issued)
	       << " wants " << ccap_string(in->caps_wanted())
	       << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << "    path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      // reset per-cap sequence numbers; the MDS re-issues from scratch.
      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino, 
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // describe each snaprealm only once per reconnect message.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }	
    }
  }

  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
2779
2780
2781void Client::kick_requests(MetaSession *session)
2782{
2783 ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2784 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2785 p != mds_requests.end();
2786 ++p) {
31f18b77
FG
2787 MetaRequest *req = p->second;
2788 if (req->got_unsafe)
2789 continue;
2790 if (req->aborted()) {
2791 if (req->caller_cond) {
2792 req->kick = true;
2793 req->caller_cond->Signal();
2794 }
7c673cae 2795 continue;
31f18b77
FG
2796 }
2797 if (req->retry_attempt > 0)
7c673cae 2798 continue; // new requests only
31f18b77 2799 if (req->mds == session->mds_num) {
7c673cae
FG
2800 send_request(p->second, session);
2801 }
2802 }
2803}
2804
2805void Client::resend_unsafe_requests(MetaSession *session)
2806{
2807 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2808 !iter.end();
2809 ++iter)
2810 send_request(*iter, session);
2811
2812 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2813 // process completed requests in clientreplay stage.
2814 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2815 p != mds_requests.end();
2816 ++p) {
2817 MetaRequest *req = p->second;
2818 if (req->got_unsafe)
2819 continue;
31f18b77
FG
2820 if (req->aborted())
2821 continue;
7c673cae
FG
2822 if (req->retry_attempt == 0)
2823 continue; // old requests only
2824 if (req->mds == session->mds_num)
2825 send_request(req, session, true);
2826 }
2827}
2828
2829void Client::wait_unsafe_requests()
2830{
2831 list<MetaRequest*> last_unsafe_reqs;
2832 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2833 p != mds_sessions.end();
2834 ++p) {
2835 MetaSession *s = p->second;
2836 if (!s->unsafe_requests.empty()) {
2837 MetaRequest *req = s->unsafe_requests.back();
2838 req->get();
2839 last_unsafe_reqs.push_back(req);
2840 }
2841 }
2842
2843 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2844 p != last_unsafe_reqs.end();
2845 ++p) {
2846 MetaRequest *req = *p;
2847 if (req->unsafe_item.is_on_list())
2848 wait_on_list(req->waitfor_safe);
2849 put_request(req);
2850 }
2851}
2852
// Called when a session is torn down: wake the caller of every request
// that was targeted at this rank, detach the requests from the session,
// and forcibly unregister any that were still unsafe (their commit will
// never arrive on this session).
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    // advance first: unregister_request() below removes the entry.
    ++p;
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
2879
2880
2881
2882
2883/************
2884 * leases
2885 */
2886
// Account for a server-initiated (push) message on this session: bump
// the session sequence, and if we are mid-close, re-send REQUEST_CLOSE
// with the updated seq so the close acknowledges everything received.
void Client::got_mds_push(MetaSession *s)
{
  s->seq++;
  ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
  if (s->state == MetaSession::STATE_CLOSING) {
    s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
  }
}
2895
// Handle a lease revocation from an MDS (the only action asserted
// here).  If we hold the named dentry lease, drop it; in all cases,
// acknowledge with a LEASE_RELEASE.  Always consumes the message.
void Client::handle_lease(MClientLease *m)
{
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    // nothing to drop locally, but still acknowledge the revoke.
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    // mark the dentry lease as held by no MDS.
    dn->lease_mds = -1;
  }

 revoke:
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2938
// Drop n references on an inode; on the last reference, release its
// caps, evict it from the object cacher (which must already be clean),
// remove it from inode_map (and the faked-ino table if in use), clear
// root bookkeeping if this was the root, and free it.
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << "put_inode on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
2964
// Destroy an (empty) Dir object attached to an inode: unpin the
// parent dentry that the open dir held, free the Dir, and drop the
// inode reference that pinned the parent inode.
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
  assert(dir->is_empty());
  assert(in->dir == dir);
  assert(in->dn_set.size() < 2); // dirs can't be hard-linked
  if (!in->dn_set.empty())
    in->get_first_parent()->put();   // unpin dentry
  
  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}
2979
2980 /**
2981 * Don't call this with in==NULL, use get_or_create for that
2982 * leave dn set to default NULL unless you're trying to add
2983 * a new inode to a pre-created Dentry
2984 */
2985Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
2986{
2987 if (!dn) {
2988 // create a new Dentry
2989 dn = new Dentry;
2990 dn->name = name;
2991
2992 // link to dir
2993 dn->dir = dir;
2994 dir->dentries[dn->name] = dn;
2995 lru.lru_insert_mid(dn); // mid or top?
91327a77
AA
2996 if (!in)
2997 dir->num_null_dentries++;
7c673cae
FG
2998
2999 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3000 << " dn " << dn << " (new dn)" << dendl;
3001 } else {
91327a77
AA
3002 assert(!dn->inode);
3003 if (in)
3004 dir->num_null_dentries--;
7c673cae
FG
3005 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3006 << " dn " << dn << " (old dn)" << dendl;
3007 }
3008
3009 if (in) { // link to inode
3010 dn->inode = in;
3011 if (in->is_dir()) {
3012 if (in->dir)
3013 dn->get(); // dir -> dn pin
3014 if (in->ll_ref)
3015 dn->get(); // ll_ref -> dn pin
3016 }
3017
3018 assert(in->dn_set.count(dn) == 0);
3019
3020 // only one parent for directories!
3021 if (in->is_dir() && !in->dn_set.empty()) {
3022 Dentry *olddn = in->get_first_parent();
3023 assert(olddn->dir != dir || olddn->name != name);
3024 Inode *old_diri = olddn->dir->parent_inode;
3025 old_diri->dir_release_count++;
3026 clear_dir_complete_and_ordered(old_diri, true);
3027 unlink(olddn, true, true); // keep dir, dentry
3028 }
3029
3030 in->dn_set.insert(dn);
3031
3032 ldout(cct, 20) << "link inode " << in << " parents now " << in->dn_set << dendl;
3033 }
3034
3035 return dn;
3036}
3037
3038void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3039{
3040 InodeRef in;
3041 in.swap(dn->inode);
3042 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3043 << " inode " << dn->inode << dendl;
3044
3045 // unlink from inode
3046 if (in) {
3047 if (in->is_dir()) {
3048 if (in->dir)
3049 dn->put(); // dir -> dn pin
3050 if (in->ll_ref)
3051 dn->put(); // ll_ref -> dn pin
3052 }
3053 dn->inode = 0;
3054 assert(in->dn_set.count(dn));
3055 in->dn_set.erase(dn);
3056 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3057 }
3058
3059 if (keepdentry) {
3060 dn->lease_mds = -1;
91327a77
AA
3061 if (in)
3062 dn->dir->num_null_dentries++;
7c673cae
FG
3063 } else {
3064 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3065
3066 // unlink from dir
3067 dn->dir->dentries.erase(dn->name);
91327a77
AA
3068 if (!in)
3069 dn->dir->num_null_dentries--;
7c673cae
FG
3070 if (dn->dir->is_empty() && !keepdir)
3071 close_dir(dn->dir);
3072 dn->dir = 0;
3073
3074 // delete den
3075 lru.lru_remove(dn);
3076 dn->put();
3077 }
3078}
3079
3080/**
3081 * For asynchronous flushes, check for errors from the IO and
3082 * update the inode if necessary
3083 */
// Completion context for asynchronous data flushes: on error, log and
// record the error on the inode (set_async_err) so a later fsync/close
// can report it.  Holds an InodeRef to keep the inode alive until the
// flush completes.  Must run with client_lock held.
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3101
3102
3103/****
3104 * caps
3105 */
3106
3107void Client::get_cap_ref(Inode *in, int cap)
3108{
3109 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3110 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3111 ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3112 in->get();
3113 }
3114 if ((cap & CEPH_CAP_FILE_CACHE) &&
3115 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3116 ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3117 in->get();
3118 }
3119 in->get_cap_ref(cap);
3120}
3121
/*
 * Drop in-use references on the given cap bits. When the last reference
 * on a bit is released this may: finish a pending cap_snap (WR), mark
 * snapshot data clean and wake committers (BUFFER), report droppable
 * caps to the MDS, and release the inode pins taken in get_cap_ref().
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  // `last` holds the cap bits whose refcount just dropped to zero.
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // Bits we no longer use and the MDS has not issued -> tell the MDS.
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	// Last writer is gone: the newest cap_snap can be finalized now.
	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	// All buffered data has been written out; snapshots no longer
	// have dirty buffered data either.
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;  // matches the in->get() taken in get_cap_ref()
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;  // matches the in->get() taken in get_cap_ref()
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3155
/*
 * Acquire cap references on `in`: loop until the bits in `need` (plus as
 * many of `want` as are available and not being revoked) are usable.
 *
 * @param in     inode to take caps on
 * @param need   cap bits the caller cannot proceed without
 * @param want   additional cap bits that are nice to have
 * @param phave  out: the bits actually referenced (need | (have & want))
 * @param endoff for writes, the end offset the caller intends to reach;
 *               used to grow max_size via the MDS. Blocks until max_size
 *               covers it.
 * @return 0 on success; -EBADF if open file modes no longer cover `need`;
 *         -EROFS for WR on a read-only session; or a pool-perm /
 *         _renew_caps error.
 */
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    // If the open file modes no longer imply `need`, the fd state changed
    // underneath the caller.
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // Ask the MDS for a larger max_size ahead of a large write.
      if (endoff > 0 &&
	  (endoff >= (loff_t)in->max_size ||
	   endoff > (loff_t)(in->size << 1)) &&
	  endoff > (loff_t)in->wanted_max_size) {
	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	in->wanted_max_size = endoff;
	check_caps(in, 0);
      }

      // Cannot write past max_size until the MDS raises it.
      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      // Writes must not race with a cap_snap that is still being written
      // or that still has dirty buffered data.
      if (!in->cap_snaps.empty()) {
	if (in->cap_snaps.rbegin()->second.writing) {
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  // Kick a flush so the dirty snapshot data drains.
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	// Only take `want` bits the MDS is not in the middle of revoking.
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		 << " need " << ccap_string(need) << " want " << ccap_string(want)
		 << " revoking " << ccap_string(revoking)
		 << dendl;
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    // The MDS dropped our caps at some point (e.g. session reset):
    // re-request them before waiting, or we could block forever.
    if (in->flags & I_CAP_DROPPED) {
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if ((mds_wanted & file_wanted) ==
	  (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
	in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3250
3251int Client::get_caps_used(Inode *in)
3252{
3253 unsigned used = in->caps_used();
3254 if (!(used & CEPH_CAP_FILE_CACHE) &&
3255 !objectcacher->set_is_empty(&in->oset))
3256 used |= CEPH_CAP_FILE_CACHE;
3257 return used;
3258}
3259
3260void Client::cap_delay_requeue(Inode *in)
3261{
3262 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3263 in->hold_caps_until = ceph_clock_now();
3264 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3265 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3266}
3267
/*
 * Send a cap update (CEPH_CAP_OP_UPDATE) for one cap to its MDS,
 * carrying current inode metadata and any dirty bits being flushed.
 *
 * @param in        inode the cap belongs to
 * @param session   MDS session the cap came from
 * @param cap       the cap being updated
 * @param sync      set CLIENT_CAPS_SYNC on the message
 * @param used      cap bits currently in active local use
 * @param want      cap bits we'd like the MDS to keep issued
 * @param retain    cap bits we are willing to keep (revoking bits removed)
 * @param flush     dirty cap bits being flushed to the MDS
 * @param flush_tid tid identifying this flush (0 when not flushing)
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      bool sync, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain bits currently being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
		 << " mds." << session->mds_num << " seq " << cap->seq
		 << (sync ? " sync " : " async ")
		 << " used " << ccap_string(used)
		 << " want " << ccap_string(want)
		 << " flush " << ccap_string(flush)
		 << " retain " << ccap_string(retain)
		 << " held "<< ccap_string(held)
		 << " revoking " << ccap_string(revoking)
		 << " dropping " << ccap_string(dropping)
		 << dendl;

  // Test hook: deliberately fail to release revoked caps.
  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour: keep only retained bits; implemented shrinks to
    // what is still issued or actively in use.
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
				   in->ino,
				   0,
				   cap->cap_id, cap->seq,
				   cap->implemented,
				   want,
				   flush,
				   cap->mseq,
				   cap_epoch_barrier);
  // Credit the flush to whoever dirtied the caps, not necessarily us.
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // Only the auth MDS manages max_size.
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3377
31f18b77
FG
3378static bool is_max_size_approaching(Inode *in)
3379{
3380 /* mds will adjust max size according to the reported size */
3381 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3382 return false;
3383 if (in->size >= in->max_size)
3384 return true;
3385 /* half of previous max_size increment has been used */
3386 if (in->max_size > in->reported_size &&
3387 (in->size << 1) >= in->max_size + in->reported_size)
3388 return true;
3389 return false;
3390}
7c673cae
FG
3391
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  if (in->is_dir() && (in->flags & I_COMPLETE)) {
    // we do this here because we don't want to drop to Fs (and then
    // drop the Fs if we do a create!) if that alone makes us send lookups
    // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
    wanted |= CEPH_CAP_FILE_EXCL;
  }

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  // Bits we'd like to keep; be generous unless we're unmounting.
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting) {
    if (wanted)
      retain |= CEPH_CAP_ANY;
    else
      retain |= CEPH_CAP_ANY_SHARED;
  }

  ldout(cct, 10) << "check_caps on " << *in
		 << " wanted " << ccap_string(wanted)
		 << " used " << ccap_string(used)
		 << " issued " << ccap_string(issued)
		 << " revoking " << ccap_string(revoking)
		 << " flags=" << flags
		 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // If Fc is being revoked and nothing is buffering, try dropping the
  // cached data now so the cap can be released below.
  if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
      (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) {
    if (_release(in))
      used &= ~CEPH_CAP_FILE_CACHE;
  }

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  if (flags & CHECK_CAPS_NODELAY)
    in->hold_caps_until = utime_t();
  else
    cap_delay_requeue(in);

  utime_t now = ceph_clock_now();

  map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
  while (it != in->caps.end()) {
    mds_rank_t mds = it->first;
    Cap *cap = it->second;
    ++it;  // advance now; the current entry may be touched below

    MetaSession *session = mds_sessions[mds];
    assert(session);

    // Usage satisfied by the auth cap doesn't keep a non-auth cap busy.
    cap_used = used;
    if (in->auth_cap && cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap->implemented & ~cap->issued;

    ldout(cct, 10) << " cap mds." << mds
	     << " issued " << ccap_string(cap->issued)
	     << " implemented " << ccap_string(cap->implemented)
	     << " revoking " << ccap_string(revoking) << dendl;

    // Need a larger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
	in->wanted_max_size > in->requested_max_size &&
	cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap->issued & CEPH_CAP_FILE_WR) &&
	cap == in->auth_cap &&
	is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
		     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap->wanted | cap->issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if (wanted == cap->wanted &&         // mds knows what we want.
	((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
	!in->dirty_caps)                 // and we have no dirty caps
      continue;

    if (now < in->hold_caps_until) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
	session->mds_state < MDSMap::STATE_ACTIVE &&
	session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
		     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
	flush_snaps(in, true);
      if (in->flushing_caps)
	flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == cap && in->dirty_caps) {
      // Dirty metadata rides along with the update to the auth MDS.
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
	     retain, flushing, flush_tid);
  }
}
3540
3541
/*
 * Capture the inode's current dirty state into a CapSnap keyed by the
 * superseded snap context's seq, so it can later be flushed to the MDS
 * via flush_snaps(). No-op if nothing is dirty/writing, or if a pending
 * cap_snap already covers this state.
 */
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // The newest cap_snap is still being written; it covers this too.
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
	     (used & CEPH_CAP_FILE_WR) ||
	     (dirty & CEPH_CAP_ANY_WR)) {
    // Construct the CapSnap in place, keyed by the old snap seq.
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    // Remember whether buffered data must be written back for this snap.
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // Writers in flight: the snap is finalized when the last WR ref is
      // dropped (see put_cap_ref).
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3581
/*
 * Finalize a queued CapSnap with the inode's current size/times. If
 * dirty buffered data is still outstanding, the flush to the MDS is
 * deferred until writeback completes (see _flushed_cap_snap); otherwise
 * flush_snaps() runs immediately.
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  // Pick up anything dirtied since the snap was queued.
  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // Dirty buffers pending writeback; flush_snaps will run later.
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3607
3608void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3609{
3610 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3611 in->cap_snaps.at(seq).dirty_data = 0;
3612 flush_snaps(in);
3613}
3614
/*
 * Send CEPH_CAP_OP_FLUSHSNAP messages for this inode's cap_snaps to the
 * auth MDS. CapSnaps still writing or with dirty buffered data are
 * skipped. With all_again=false, capsnaps already assigned a flush_tid
 * are not re-sent (used during normal operation); all_again=true resends
 * everything (used on MDS reconnect).
 */
void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  assert(in->cap_snaps.size());

  // pick auth mds
  assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
	continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    if (capsnap.dirty_data || capsnap.writing)
      continue;  // not ready to flush yet

    if (capsnap.flush_tid == 0) {
      // First flush of this capsnap: assign a tid and track it on the
      // session so acks can be matched up.
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
	session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
				     cap_epoch_barrier);
    if (user_id >= 0)
      m->caller_uid = user_id;
    if (group_id >= 0)
      m->caller_gid = group_id;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    // Metadata is the state captured at snapshot time, not current state.
    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    ::encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message(m);
  }
}
3691
3692
3693
3694void Client::wait_on_list(list<Cond*>& ls)
3695{
3696 Cond cond;
3697 ls.push_back(&cond);
3698 cond.Wait(client_lock);
3699 ls.remove(&cond);
3700}
3701
3702void Client::signal_cond_list(list<Cond*>& ls)
3703{
3704 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3705 (*it)->Signal();
3706}
3707
3708void Client::wait_on_context_list(list<Context*>& ls)
3709{
3710 Cond cond;
3711 bool done = false;
3712 int r;
3713 ls.push_back(new C_Cond(&cond, &done, &r));
3714 while (!done)
3715 cond.Wait(client_lock);
3716}
3717
3718void Client::signal_context_list(list<Context*>& ls)
3719{
3720 while (!ls.empty()) {
3721 ls.front()->complete(0);
3722 ls.pop_front();
3723 }
3724}
3725
3726void Client::wake_inode_waiters(MetaSession *s)
3727{
3728 xlist<Cap*>::iterator iter = s->caps.begin();
3729 while (!iter.end()){
3730 signal_cond_list((*iter)->inode->waitfor_caps);
3731 ++iter;
3732 }
3733}
3734
3735
3736// flush dirty data (from objectcache)
3737
3738class C_Client_CacheInvalidate : public Context {
3739private:
3740 Client *client;
3741 vinodeno_t ino;
3742 int64_t offset, length;
3743public:
3744 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3745 client(c), offset(off), length(len) {
3746 if (client->use_faked_inos())
3747 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3748 else
3749 ino = in->vino();
3750 }
3751 void finish(int r) override {
3752 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3753 assert(!client->client_lock.is_locked_by_me());
3754 client->_async_invalidate(ino, offset, length);
3755 }
3756};
3757
3758void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3759{
3760 if (unmounting)
3761 return;
3762 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3763 ino_invalidate_cb(callback_handle, ino, off, len);
3764}
3765
3766void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3767
3768 if (ino_invalidate_cb)
3769 // we queue the invalidate, which calls the callback and decrements the ref
3770 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3771}
3772
/*
 * Drop all data cached for this inode from the userspace object cacher
 * and schedule the kernel-side invalidate callback for the whole file.
 */
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    // release_set is best-effort; complain if anything remained pinned.
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  // off=0, len=0 signals a whole-file invalidate to the callback.
  _schedule_invalidate_callback(in, 0, 0);
}
3786
/*
 * Drop cached extents covering the byte range [off, off+len) of this
 * inode from the object cacher, then schedule the kernel invalidate
 * callback for the same range.
 */
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    // Map the file range onto object extents per the inode's layout.
    vector<ObjectExtent> ls;
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
3800
3801bool Client::_release(Inode *in)
3802{
3803 ldout(cct, 20) << "_release " << *in << dendl;
3804 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3805 _invalidate_inode_cache(in);
3806 return true;
3807 }
3808 return false;
3809}
3810
3811bool Client::_flush(Inode *in, Context *onfinish)
3812{
3813 ldout(cct, 10) << "_flush " << *in << dendl;
3814
3815 if (!in->oset.dirty_or_tx) {
3816 ldout(cct, 10) << " nothing to flush" << dendl;
3817 onfinish->complete(0);
3818 return true;
3819 }
3820
3821 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 3822 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
3823 objectcacher->purge_set(&in->oset);
3824 if (onfinish) {
3825 onfinish->complete(-ENOSPC);
3826 }
3827 return true;
3828 }
3829
3830 return objectcacher->flush_set(&in->oset, onfinish);
3831}
3832
/*
 * Synchronously flush buffered data in [offset, offset+size) to the
 * OSDs. client_lock must be held on entry; it is dropped while waiting
 * for writeback to complete and re-taken before returning.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // Private mutex/cond pair for the completion; C_SafeCond sets `safe`
  // and signals when the flush finishes.
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, onflush);
  if (!ret) {
    // wait for flush: drop client_lock so writeback can make progress.
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3857
3858void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3859{
3860 // Mutex::Locker l(client_lock);
3861 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3862 Inode *in = static_cast<Inode *>(oset->parent);
3863 assert(in);
3864 _flushed(in);
3865}
3866
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  // Writeback for this inode's object set finished: drop the cache and
  // buffer cap refs held while the flush was in flight.
  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3873
3874
3875
3876// checks common to add_update_cap, handle_cap_grant
3877void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3878{
3879 unsigned had = in->caps_issued();
3880
3881 if ((issued & CEPH_CAP_FILE_CACHE) &&
3882 !(had & CEPH_CAP_FILE_CACHE))
3883 in->cache_gen++;
3884
3885 if ((issued & CEPH_CAP_FILE_SHARED) &&
3886 !(had & CEPH_CAP_FILE_SHARED)) {
3887 in->shared_gen++;
3888
3889 if (in->is_dir())
3890 clear_dir_complete_and_ordered(in, true);
3891 }
3892}
3893
/*
 * Record or update a cap granted by an MDS (cap import/grant handling).
 * Tolerates messages that arrive out of order around an auth-MDS change,
 * attaches the inode to its snaprealm on the first cap, and may switch
 * the auth cap (migrating flushing-caps bookkeeping to the new session).
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
			    int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // First cap from this MDS for this inode.
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // First cap overall: bind the inode to its snap realm.
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // Adopt this cap as auth if we have none, or if its mseq is newer
    // than the current auth cap's.
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << "add_update_cap changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued = issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->gen = mds_session->cap_gen;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
	continue;
      if (it->second->implemented & ~it->second->issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  // New bits were granted: unblock anyone waiting in get_caps().
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
3979
/*
 * Remove one cap from its inode and session, freeing it. If
 * queue_release, a cap release is queued on the session so the MDS is
 * informed. Removing the last cap detaches the inode from its snaprealm.
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    // Losing the auth cap: flushing bookkeeping no longer applies.
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;

  if (!in->is_any_caps()) {
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
4018
4019void Client::remove_all_caps(Inode *in)
4020{
4021 while (!in->caps.empty())
4022 remove_cap(in->caps.begin()->second, true);
4023}
4024
/*
 * Drop all caps held from a session (e.g. when the session is torn
 * down). Dirty/flushing state is discarded with an error message, cap
 * waiters are woken, and affected inodes are flagged I_CAP_DROPPED so
 * get_caps() knows to re-request caps.
 */
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      // Auth cap is going away: remember what state we are abandoning.
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;
    }
    remove_cap(cap, false);
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // Keep the inode alive across cap_snaps.clear().
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      // Flushes in flight can never be acked now; drop the bookkeeping.
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in);
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4060
91327a77 4061int Client::_do_remount(bool retry_on_error)
b32b8144 4062{
91327a77
AA
4063 uint64_t max_retries = cct->_conf->get_val<uint64_t>("mds_max_retries_on_remount_failure");
4064
b32b8144
FG
4065 errno = 0;
4066 int r = remount_cb(callback_handle);
91327a77
AA
4067 if (r == 0) {
4068 retries_on_invalidate = 0;
4069 } else {
b32b8144
FG
4070 int e = errno;
4071 client_t whoami = get_nodeid();
4072 if (r == -1) {
4073 lderr(cct) <<
4074 "failed to remount (to trim kernel dentries): "
4075 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4076 } else {
4077 lderr(cct) <<
4078 "failed to remount (to trim kernel dentries): "
4079 "return code = " << r << dendl;
4080 }
91327a77
AA
4081 bool should_abort =
4082 (cct->_conf->get_val<bool>("client_die_on_failed_remount") ||
4083 cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4084 !(retry_on_error && (++retries_on_invalidate < max_retries));
b32b8144
FG
4085 if (should_abort && !unmounting) {
4086 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4087 ceph_abort();
4088 }
4089 }
4090 return r;
4091}
4092
7c673cae
FG
// Finisher context that performs the actual remount from a worker
// thread (queued by _invalidate_kernel_dcache).
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    assert(r == 0);
    // retry_on_error=true: _do_remount may retry up to the configured
    // limit before aborting.
    client->_do_remount(true);
  }
};
4103
/*
 * Make the kernel drop cached dentries: either by scheduling per-dentry
 * invalidation for root's children (when dentry invalidation is
 * possible) or, failing that, by queueing a remount, which trims all
 * unused dentries.
 */
void Client::_invalidate_kernel_dcache()
{
  if (unmounting)
    return;
  if (can_invalidate_dentries) {
    if (dentry_invalidate_cb && root->dir) {
      for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
	   p != root->dir->dentries.end();
	   ++p) {
	// Only invalidate dentries that point at an inode.
	if (p->second->inode)
	  _schedule_invalidate_dentry_callback(p->second, false);
      }
    }
  } else if (remount_cb) {
    // Hacky:
    // when remounting a file system, linux kernel trims all unused dentries in the fs
    remount_finisher.queue(new C_Client_Remount(this));
  }
}
4123
91327a77
AA
/*
 * If every dentry under `in` is a null (negative) dentry, unlink the
 * expireable ones and close the Dir when it empties. Recurses into the
 * snapdir when it is open.
 */
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  // Only proceed when *all* dentries are null; a mixed dir is left alone.
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() erases the current entry
      assert(!dn->inode);
      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4148
// Reduce the number of caps held on session s to at most `max`.
// Non-auth caps whose bits are covered by the auth cap are dropped outright;
// for other inodes we try to make their dentries expireable and trim them,
// which releases the cap indirectly.  If we still hold too many caps at the
// end, fall back to invalidating the kernel dcache wholesale.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    // Hold a ref so the inode cannot vanish while we inspect it.
    InodeRef in(cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap: nothing we actually use is granted only here
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      while (q != in->dn_set.end()) {
        Dentry *dn = *q++;
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      // Every dentry is trimmable, so the cap will go away with them; count
      // it now (the root inode's cap is never counted).
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  // Trim outside the cap-iteration loop so remove_cap side effects cannot
  // invalidate our traversal.
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > max)
    _invalidate_kernel_dcache();
}
4215
4216void Client::force_session_readonly(MetaSession *s)
4217{
4218 s->readonly = true;
4219 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4220 Inode *in = (*p)->inode;
4221 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4222 signal_cond_list(in->waitfor_caps);
4223 }
4224}
4225
// Move the inode's dirty cap bits into "flushing" state under a fresh flush
// tid (returned through ptid) and register that tid with the auth session.
// Returns the cap bits now being flushed.  Caller must only invoke this when
// the inode actually has dirty caps.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  assert(flushing);  // precondition: something is dirty

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  // num_flushing_caps counts inodes with at least one flush in flight, so it
  // is only bumped on the transition from "not flushing" to "flushing".
  if (!in->flushing_caps) {
    ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4253
4254void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4255{
4256 for (auto &p : in->cap_snaps) {
4257 CapSnap &capsnap = p.second;
4258 if (capsnap.flush_tid > 0) {
4259 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4260 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4261 }
4262 }
4263 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4264 it != in->flushing_cap_tids.end();
4265 ++it) {
4266 old_s->flushing_caps_tids.erase(it->first);
4267 new_s->flushing_caps_tids.insert(it->first);
4268 }
4269 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4270}
4271
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // First drain the delayed list; each inode is popped before check_caps so
  // the list shrinks as we go.
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    delayed_list.pop_front();
    // Last element overall (and nothing on the dirty list): make the final
    // check synchronous so waiters are released only when all is flushed.
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
4304
// (Re)send a cap-flush message to the auth MDS for every flush tid still
// outstanding on `in`.  With sync set, only the last (highest) tid asks the
// MDS for a synchronous journal flush.
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  assert(cap->session == session);  // must flush via the auth session

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
	     (get_caps_used(in) | in->caps_dirty()),
	     in->caps_wanted(), (cap->issued | cap->implemented),
	     p->second, p->first);
  }
}
4326
4327void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4328{
4329 while (in->flushing_caps) {
4330 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4331 assert(it != in->flushing_cap_tids.end());
4332 if (it->first > want)
4333 break;
4334 ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
4335 << ccap_string(it->second) << " want " << want
4336 << " last " << it->first << dendl;
4337 wait_on_list(in->waitfor_caps);
4338 }
4339}
4340
4341void Client::wait_sync_caps(ceph_tid_t want)
4342{
4343 retry:
4344 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4345 << num_flushing_caps << " total flushing)" << dendl;
4346 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4347 p != mds_sessions.end();
4348 ++p) {
4349 MetaSession *s = p->second;
4350 if (s->flushing_caps_tids.empty())
4351 continue;
4352 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4353 if (oldest_tid <= want) {
4354 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4355 << " (want " << want << ")" << dendl;
4356 sync_cond.Wait(client_lock);
4357 goto retry;
4358 }
4359 }
4360}
4361
4362void Client::kick_flushing_caps(MetaSession *session)
4363{
4364 mds_rank_t mds = session->mds_num;
4365 ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;
4366
4367 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4368 Inode *in = *p;
4369 if (session->early_flushing_caps.count(in))
4370 continue;
4371 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4372 if (in->cap_snaps.size())
4373 flush_snaps(in, true);
4374 if (in->flushing_caps)
4375 flush_caps(in, session);
4376 }
4377
4378 session->early_flushing_caps.clear();
4379}
4380
// During client reconnect, resend cap flushes whose flushing bits were
// revoked by the MDS, before the normal kick.  Inodes handled here are
// remembered in early_flushing_caps so kick_flushing_caps skips them.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    assert(in->auth_cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;  // nothing revoked; the normal kick will handle this inode

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4407
4408void Client::kick_maxsize_requests(MetaSession *session)
4409{
4410 xlist<Cap*>::iterator iter = session->caps.begin();
4411 while (!iter.end()){
4412 (*iter)->inode->requested_max_size = 0;
4413 (*iter)->inode->wanted_max_size = 0;
4414 signal_cond_list((*iter)->inode->waitfor_caps);
4415 ++iter;
4416 }
4417}
4418
4419void SnapRealm::build_snap_context()
4420{
4421 set<snapid_t> snaps;
4422 snapid_t max_seq = seq;
4423
4424 // start with prior_parents?
4425 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4426 snaps.insert(prior_parent_snaps[i]);
4427
4428 // current parent's snaps
4429 if (pparent) {
4430 const SnapContext& psnapc = pparent->get_snap_context();
4431 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4432 if (psnapc.snaps[i] >= parent_since)
4433 snaps.insert(psnapc.snaps[i]);
4434 if (psnapc.seq > max_seq)
4435 max_seq = psnapc.seq;
4436 }
4437
4438 // my snaps
4439 for (unsigned i=0; i<my_snaps.size(); i++)
4440 snaps.insert(my_snaps[i]);
4441
4442 // ok!
4443 cached_snap_context.seq = max_seq;
4444 cached_snap_context.snaps.resize(0);
4445 cached_snap_context.snaps.reserve(snaps.size());
4446 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4447 cached_snap_context.snaps.push_back(*p);
4448}
4449
4450void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4451{
4452 list<SnapRealm*> q;
4453 q.push_back(realm);
4454
4455 while (!q.empty()) {
4456 realm = q.front();
4457 q.pop_front();
4458
4459 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4460 realm->invalidate_cache();
4461
4462 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4463 p != realm->pchildren.end();
4464 ++p)
4465 q.push_back(*p);
4466 }
4467}
4468
4469SnapRealm *Client::get_snap_realm(inodeno_t r)
4470{
4471 SnapRealm *realm = snap_realms[r];
4472 if (!realm)
4473 snap_realms[r] = realm = new SnapRealm(r);
4474 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4475 realm->nref++;
4476 return realm;
4477}
4478
4479SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4480{
4481 if (snap_realms.count(r) == 0) {
4482 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4483 return NULL;
4484 }
4485 SnapRealm *realm = snap_realms[r];
4486 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4487 realm->nref++;
4488 return realm;
4489}
4490
// Drop one reference on `realm`; on the last reference, unregister it, detach
// from (and unref) its parent, and delete it.  The parent put may recurse.
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    // Erase from the index before releasing the parent so no lookup can find
    // the dying realm.
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4504
4505bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4506{
4507 if (realm->parent != parent) {
4508 ldout(cct, 10) << "adjust_realm_parent " << *realm
4509 << " " << realm->parent << " -> " << parent << dendl;
4510 realm->parent = parent;
4511 if (realm->pparent) {
4512 realm->pparent->pchildren.erase(realm);
4513 put_snap_realm(realm->pparent);
4514 }
4515 realm->pparent = get_snap_realm(parent);
4516 realm->pparent->pchildren.insert(realm);
4517 return true;
4518 }
4519 return false;
4520}
4521
4522static bool has_new_snaps(const SnapContext& old_snapc,
4523 const SnapContext& new_snapc)
4524{
4525 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4526}
4527
4528
// Apply a snap trace (a sequence of encoded SnapRealmInfo records) from the
// MDS.  For each realm whose seq advanced we optionally flush dirty caps on
// the realm and its children first (with the OLD snap context), then update
// the realm and invalidate cached contexts.  The first realm in the trace is
// returned via realm_ret with a reference held; if realm_ret is NULL the
// reference is dropped here instead.
// NOTE(review): if bl decodes to zero records first_realm stays NULL and the
// put_snap_realm(first_realm) branch would dereference NULL — presumably
// callers never pass an empty trace with realm_ret == NULL; verify.
void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;

  // Realms whose caps must be snapshot-flushed, with their pre-update
  // snap contexts; each holds an extra ref until the loop at the bottom.
  map<SnapRealm*, SnapContext> dirty_realms;

  bufferlist::iterator p = bl.begin();
  while (!p.end()) {
    SnapRealmInfo info;
    ::decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
	       << dendl;

      if (flush) {
	// writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
	// flush me + children
	list<SnapRealm*> q;
	q.push_back(realm);
	while (!q.empty()) {
	  SnapRealm *realm = q.front();
	  q.pop_front();

	  for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
	       p != realm->pchildren.end();
	       ++p)
	    q.push_back(*p);

	  if (dirty_realms.count(realm) == 0) {
	    realm->nref++;  // ref dropped via put_snap_realm below
	    dirty_realms[realm] = realm->get_snap_context();
	  }
	}
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << "  snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
	       << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }
        
    if (!first_realm)
      first_realm = realm;  // keep the ref for the caller (or final put)
    else
      put_snap_realm(realm);
  }

  // Now that realms carry their new contexts, queue cap snaps for any inode
  // in a realm that actually gained new snaps.
  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
       q != dirty_realms.end();
       ++q) {
    SnapRealm *realm = q->first;
    // if there are new snaps ?
    if (has_new_snaps(q->second, realm->get_snap_context())) { 
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
      while (!r.end()) {
	Inode *in = *r;
	++r;
	queue_cap_snap(in, q->second);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);
  }

  if (realm_ret)
    *realm_ret = first_realm;
  else
    put_snap_realm(first_realm);
}
4621
// Handle an MClientSnap message from an MDS.  For a SPLIT operation, inodes
// listed in split_inos are moved out of their current realm into the new one
// (and child realms are re-parented) before the snap trace is applied; the
// moved inodes then get cap snaps queued if the new realm brought new snaps.
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  // inode -> its pre-move snap context, used to detect new snaps after the move
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);
    
    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
	 p != m->split_inos.end();
	 ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// An inode already in a newer realm stays put.
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm " 
		   << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
	 p != m->split_realms.end();
	 ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // DESTROY does not flush; everything else writes back dirty caps first.
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // Attach the moved inodes to the new realm and queue cap snaps where the
    // move exposed new snapshots.
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
4699
4700void Client::handle_quota(MClientQuota *m)
4701{
4702 mds_rank_t mds = mds_rank_t(m->get_source().num());
4703 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4704 if (!session) {
4705 m->put();
4706 return;
4707 }
4708
4709 got_mds_push(session);
4710
4711 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4712
4713 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4714 if (inode_map.count(vino)) {
4715 Inode *in = NULL;
4716 in = inode_map[vino];
4717
4718 if (in) {
4719 in->quota = m->quota;
4720 in->rstat = m->rstat;
4721 }
4722 }
4723
4724 m->put();
4725}
4726
// Top-level dispatcher for MClientCaps messages: applies any OSD epoch
// barrier, finds the target inode, and routes to the per-op handler.
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload();  // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    // intentionally no return: IMPORT installs the cap here, then falls
    // through to be processed as a grant by the switch below
    handle_cap_import(session, in, m);
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();
  }
}
4800
// Handle a cap IMPORT: the MDS `session` is taking over this inode's cap
// (usually from peer_mds during migration).  Install/refresh the cap on the
// importing session, drop the stale peer cap, and re-kick any flushes if we
// are now flushing via the new auth session.
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // Carry the permissions from the exporting cap over to the new one.
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap) {
      cap_perms = cap->latest_perms;
    }
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
		 CEPH_CAP_FLAG_AUTH, cap_perms);
  
  // Remove the old peer cap only if it is still the one referenced in the
  // import message.
  if (cap && cap->cap_id == m->peer.cap_id) {
      remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);
  
  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4841
// Handle a cap EXPORT: mds `session` is giving up this inode's cap.  If the
// message names a peer cap, fold our cap state into the peer's cap (creating
// one if needed) and transfer auth/flushing state; otherwise just note that
// the cap was dropped.  Finally remove the exported cap.
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  // Only act if the cap being exported is the one we still hold.
  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
	Cap *tcap = in->caps[peer_mds];
	// Merge only if the existing peer cap matches and is older than the
	// state carried by the export message.
	if (tcap->cap_id == m->peer.cap_id &&
	    ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
	  tcap->cap_id = m->peer.cap_id;
	  tcap->seq = m->peer.seq - 1;
	  tcap->issue_seq = tcap->seq;
	  tcap->mseq = m->peer.mseq;
	  tcap->issued |= cap->issued;
	  tcap->implemented |= cap->issued;
	  if (cap == in->auth_cap)
	    in->auth_cap = tcap;
	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
	    adjust_session_flushing_caps(in, session, tsession);
	}
      } else {
	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
		       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
		       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
		       cap->latest_perms);
      }
    } else {
      // No peer: the auth cap simply went away.
      if (cap == in->auth_cap)
	in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
4889
// Handle an MDS-initiated truncation notice: apply the message's size and
// truncate seq/size to the cached inode.
void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);

  ldout(cct, 10) << "handle_cap_trunc on ino " << *in
	   << " size " << in->size << " -> " << m->get_size()
	   << dendl;
  
  // Pass our issued|dirty cap bits to the update helper along with the new
  // size/truncate information.
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  update_inode_file_size(in, issued, m->get_size(),
			 m->get_truncate_seq(), m->get_truncate_size());
  m->put();
}
4906
// Handle a FLUSH_ACK for a dirty-cap flush.  Drops every recorded flush tid
// <= the acked tid, clears the corresponding flushing bits on the inode, and
// wakes waiters (including the global sync waiter when this session has no
// older pending tid left).
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  // Walk tids in order: the acked tid tells us which bits were cleaned; any
  // earlier tids are implicitly covered and dropped too.  Bits re-dirtied by
  // a later pending tid are masked back out of `cleaned`.
  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
        cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
	  << " cleaned " << ccap_string(cleaned) << " on " << *in
	  << " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // Wake wait_sync_caps() when no older flush is outstanding on session.
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
	      << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // Drop the ref held for the flush once nothing is dirty any more.
      if (!in->caps_dirty())
	put_inode(in);
    }
  }
  
  m->put();
}
4966
4967
// Handle a FLUSHSNAP_ACK: the MDS has persisted one of our cap snaps.
// Erase the matching CapSnap (by follows + flush tid) and its session tid;
// duplicate/late acks are logged and ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);
  snapid_t follows = m->get_snap_follows();

  if (in->cap_snaps.count(follows)) {
    CapSnap &capsnap = in->cap_snaps.at(follows);
    if (m->get_client_tid() != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
	      << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
	tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      // Nothing in flight any more: drop off the session's flushing list.
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
	in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(follows);
    }
  } else {
    ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
	    << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }

  m->put();
}
4997
// Context that asks the kernel (via dentry_invalidate_cb) to invalidate one
// dentry, queued on the async dentry invalidator finisher.  Captures the
// parent-dir vinodeno, the dentry name and — only when `del` is set — the
// target inode's vinodeno; uses faked inos when the client is configured so.
class C_Client_DentryInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t dirino;
  vinodeno_t ino;
  string name;
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
      if (client->use_faked_inos()) {
	dirino.ino = dn->dir->parent_inode->faked_ino;
	if (del)
	  ino.ino = dn->inode->faked_ino;
      } else {
	dirino = dn->dir->parent_inode->vino();
	if (del)
	  ino = dn->inode->vino();
      }
      if (!del)
	ino.ino = inodeno_t();  // no target: only the name is invalidated
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5025
5026void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5027{
5028 if (unmounting)
5029 return;
5030 ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
5031 << " in dir " << dirino << dendl;
5032 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5033}
5034
5035void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5036{
5037 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5038 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5039}
5040
// Try to free an inode by dropping everything that pins it: expireable child
// dentries (recursing through snapshot subtrees), an open snapdir, and — when
// sched_inval is set and the kernel still references it — its own dentries,
// with kernel invalidation callbacks scheduled for each.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  // Local estimate of remaining references; decremented as pins are removed.
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // the Dir held one reference
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      //        so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
5083
5084void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
5085{
5086 mds_rank_t mds = session->mds_num;
5087 int used = get_caps_used(in);
5088 int wanted = in->caps_wanted();
5089
5090 const int old_caps = cap->issued;
5091 const int new_caps = m->get_caps();
5092 ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
5093 << " mds." << mds << " seq " << m->get_seq()
5094 << " caps now " << ccap_string(new_caps)
5095 << " was " << ccap_string(old_caps) << dendl;
5096 cap->seq = m->get_seq();
28e407b8 5097 cap->gen = session->cap_gen;
7c673cae 5098
7c673cae 5099 // update inode
1adf2230
AA
5100 int issued;
5101 in->caps_issued(&issued);
5102 issued |= in->caps_dirty();
7c673cae 5103
1adf2230
AA
5104 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5105 !(issued & CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
5106 in->mode = m->head.mode;
5107 in->uid = m->head.uid;
5108 in->gid = m->head.gid;
5109 in->btime = m->btime;
5110 }
5111 bool deleted_inode = false;
1adf2230
AA
5112 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5113 !(issued & CEPH_CAP_LINK_EXCL)) {
7c673cae
FG
5114 in->nlink = m->head.nlink;
5115 if (in->nlink == 0 &&
5116 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5117 deleted_inode = true;
5118 }
1adf2230 5119 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
7c673cae
FG
5120 m->xattrbl.length() &&
5121 m->head.xattr_version > in->xattr_version) {
5122 bufferlist::iterator p = m->xattrbl.begin();
5123 ::decode(in->xattrs, p);
5124 in->xattr_version = m->head.xattr_version;
5125 }
28e407b8
AA
5126
5127 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5128 in->dirstat.nfiles = m->get_nfiles();
5129 in->dirstat.nsubdirs = m->get_nsubdirs();
5130 }
5131
1adf2230
AA
5132 if (new_caps & CEPH_CAP_ANY_RD) {
5133 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5134 m->get_ctime(), m->get_mtime(), m->get_atime());
5135 }
5136
5137 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5138 in->layout = m->get_layout();
5139 update_inode_file_size(in, issued, m->get_size(),
5140 m->get_truncate_seq(), m->get_truncate_size());
5141 }
5142
5143 if (m->inline_version > in->inline_version) {
5144 in->inline_data = m->inline_data;
5145 in->inline_version = m->inline_version;
5146 }
5147
5148 /* always take a newer change attr */
5149 if (m->get_change_attr() > in->change_attr)
5150 in->change_attr = m->get_change_attr();
7c673cae
FG
5151
5152 // max_size
5153 if (cap == in->auth_cap &&
1adf2230
AA
5154 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5155 (m->get_max_size() != in->max_size)) {
7c673cae
FG
5156 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5157 in->max_size = m->get_max_size();
5158 if (in->max_size > in->wanted_max_size) {
5159 in->wanted_max_size = 0;
5160 in->requested_max_size = 0;
5161 }
5162 }
5163
5164 bool check = false;
5165 if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
5166 check = true;
5167
5168 check_cap_issue(in, cap, new_caps);
5169
5170 // update caps
b32b8144
FG
5171 int revoked = old_caps & ~new_caps;
5172 if (revoked) {
5173 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5174 cap->issued = new_caps;
5175 cap->implemented |= new_caps;
5176
b32b8144
FG
5177 // recall delegations if we're losing caps necessary for them
5178 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5179 in->recall_deleg(false);
5180 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5181 in->recall_deleg(true);
5182
28e407b8
AA
5183 if ((used & revoked & CEPH_CAP_FILE_BUFFER) &&
5184 !_flush(in, new C_Client_FlushComplete(this, in))) {
7c673cae 5185 // waitin' for flush
28e407b8 5186 } else if (revoked & CEPH_CAP_FILE_CACHE) {
7c673cae
FG
5187 if (_release(in))
5188 check = true;
5189 } else {
5190 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5191 check = true;
5192 }
7c673cae
FG
5193 } else if (old_caps == new_caps) {
5194 ldout(cct, 10) << " caps unchanged at " << ccap_string(old_caps) << dendl;
5195 } else {
5196 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
5197 cap->issued = new_caps;
5198 cap->implemented |= new_caps;
5199
5200 if (cap == in->auth_cap) {
5201 // non-auth MDS is revoking the newly grant caps ?
5202 for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
5203 if (it->second == cap)
5204 continue;
5205 if (it->second->implemented & ~it->second->issued & new_caps) {
5206 check = true;
5207 break;
5208 }
5209 }
5210 }
5211 }
5212
5213 if (check)
5214 check_caps(in, 0);
5215
5216 // wake up waiters
5217 if (new_caps)
5218 signal_cond_list(in->waitfor_caps);
5219
5220 // may drop inode's last ref
5221 if (deleted_inode)
5222 _try_to_trim_inode(in, true);
5223
5224 m->put();
5225}
5226
7c673cae
FG
5227int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5228{
5229 if (perms.uid() == 0)
5230 return 0;
5231
5232 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5233 int ret = _posix_acl_permission(in, perms, want);
5234 if (ret != -EAGAIN)
5235 return ret;
5236 }
5237
5238 // check permissions before doing anything else
5239 if (!in->check_mode(perms, want))
5240 return -EACCES;
5241 return 0;
5242}
5243
5244int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5245 const UserPerm& perms)
5246{
5247 int r = _getattr_for_perm(in, perms);
5248 if (r < 0)
5249 goto out;
5250
5251 r = 0;
5252 if (strncmp(name, "system.", 7) == 0) {
5253 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5254 r = -EPERM;
5255 } else {
5256 r = inode_permission(in, perms, want);
5257 }
5258out:
1adf2230 5259 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5260 return r;
5261}
5262
5263ostream& operator<<(ostream &out, const UserPerm& perm) {
5264 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5265 return out;
5266}
5267
// Permission check for setattr: may `perms` apply the attribute changes in
// `stx` (selected by `mask` bits) to inode `in`?  May clear S_ISGID from
// stx->stx_mode as a side effect (see below).  Returns 0 or negative errno.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  // refresh mode/uid/gid (and ACL xattrs if enabled) before deciding
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // truncate requires write permission on the file itself
  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  // chown: only root, or a no-op chown (owner keeping the same uid)
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  // chgrp: root, or the owner moving to a group they belong to (or a no-op)
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
			     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    // chmod: root or owner only
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-root callers outside the (resulting) group silently lose setgid
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owners may only "touch to now" (utimes(NULL)-style), and then
      // only with write permission; explicit timestamps require ownership
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5324
5325int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5326{
181888fb 5327 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5328 unsigned want = 0;
5329
5330 if ((flags & O_ACCMODE) == O_WRONLY)
5331 want = MAY_WRITE;
5332 else if ((flags & O_ACCMODE) == O_RDWR)
5333 want = MAY_READ | MAY_WRITE;
5334 else if ((flags & O_ACCMODE) == O_RDONLY)
5335 want = MAY_READ;
5336 if (flags & O_TRUNC)
5337 want |= MAY_WRITE;
5338
5339 int r = 0;
5340 switch (in->mode & S_IFMT) {
5341 case S_IFLNK:
5342 r = -ELOOP;
5343 goto out;
5344 case S_IFDIR:
5345 if (want & MAY_WRITE) {
5346 r = -EISDIR;
5347 goto out;
5348 }
5349 break;
5350 }
5351
5352 r = _getattr_for_perm(in, perms);
5353 if (r < 0)
5354 goto out;
5355
5356 r = inode_permission(in, perms, want);
5357out:
5358 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5359 return r;
5360}
5361
5362int Client::may_lookup(Inode *dir, const UserPerm& perms)
5363{
181888fb 5364 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5365 int r = _getattr_for_perm(dir, perms);
5366 if (r < 0)
5367 goto out;
5368
5369 r = inode_permission(dir, perms, MAY_EXEC);
5370out:
5371 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5372 return r;
5373}
5374
5375int Client::may_create(Inode *dir, const UserPerm& perms)
5376{
181888fb 5377 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5378 int r = _getattr_for_perm(dir, perms);
5379 if (r < 0)
5380 goto out;
5381
5382 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5383out:
5384 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5385 return r;
5386}
5387
5388int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5389{
181888fb 5390 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5391 int r = _getattr_for_perm(dir, perms);
5392 if (r < 0)
5393 goto out;
5394
5395 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5396 if (r < 0)
5397 goto out;
5398
5399 /* 'name == NULL' means rmsnap */
5400 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5401 InodeRef otherin;
5402 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5403 if (r < 0)
5404 goto out;
5405 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5406 r = -EPERM;
5407 }
5408out:
5409 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5410 return r;
5411}
5412
5413int Client::may_hardlink(Inode *in, const UserPerm& perms)
5414{
181888fb 5415 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5416 int r = _getattr_for_perm(in, perms);
5417 if (r < 0)
5418 goto out;
5419
5420 if (perms.uid() == 0 || perms.uid() == in->uid) {
5421 r = 0;
5422 goto out;
5423 }
5424
5425 r = -EPERM;
5426 if (!S_ISREG(in->mode))
5427 goto out;
5428
5429 if (in->mode & S_ISUID)
5430 goto out;
5431
5432 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5433 goto out;
5434
5435 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5436out:
5437 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5438 return r;
5439}
5440
5441int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5442{
5443 int mask = CEPH_STAT_CAP_MODE;
5444 bool force = false;
5445 if (acl_type != NO_ACL) {
5446 mask |= CEPH_STAT_CAP_XATTR;
5447 force = in->xattr_version == 0;
5448 }
5449 return _getattr(in, mask, perms, force);
5450}
5451
5452vinodeno_t Client::_get_vino(Inode *in)
5453{
5454 /* The caller must hold the client lock */
5455 return vinodeno_t(in->ino, in->snapid);
5456}
5457
5458inodeno_t Client::_get_inodeno(Inode *in)
5459{
5460 /* The caller must hold the client lock */
5461 return in->ino;
5462}
5463
5464
5465/**
5466 * Resolve an MDS spec to a list of MDS daemon GIDs.
5467 *
5468 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5469 * It may be '*' in which case it matches all GIDs.
5470 *
5471 * If no error is returned, the `targets` vector will be populated with at least
5472 * one MDS.
5473 */
5474int Client::resolve_mds(
5475 const std::string &mds_spec,
5476 std::vector<mds_gid_t> *targets)
5477{
5478 assert(fsmap);
5479 assert(targets != nullptr);
5480
5481 mds_role_t role;
5482 std::stringstream ss;
5483 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5484 if (role_r == 0) {
5485 // We got a role, resolve it to a GID
5486 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5487 << role << "'" << dendl;
5488 targets->push_back(
5489 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5490 return 0;
5491 }
5492
5493 std::string strtol_err;
5494 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5495 if (strtol_err.empty()) {
5496 // It is a possible GID
5497 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5498 if (fsmap->gid_exists(mds_gid)) {
5499 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5500 targets->push_back(mds_gid);
5501 } else {
5502 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5503 << dendl;
5504 return -ENOENT;
5505 }
5506 } else if (mds_spec == "*") {
5507 // It is a wildcard: use all MDSs
5508 const auto mds_info = fsmap->get_mds_info();
5509
5510 if (mds_info.empty()) {
5511 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5512 return -ENOENT;
5513 }
5514
5515 for (const auto i : mds_info) {
5516 targets->push_back(i.first);
5517 }
5518 } else {
5519 // It did not parse as an integer, it is not a wildcard, it must be a name
5520 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5521 if (mds_gid == 0) {
5522 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5523
5524 lderr(cct) << "FSMap: " << *fsmap << dendl;
5525
5526 return -ENOENT;
5527 } else {
5528 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5529 << "' to GID " << mds_gid << dendl;
5530 targets->push_back(mds_gid);
5531 }
5532 }
5533
5534 return 0;
5535}
5536
5537
5538/**
5539 * Authenticate with mon and establish global ID
5540 */
5541int Client::authenticate()
5542{
5543 assert(client_lock.is_locked_by_me());
5544
5545 if (monclient->is_authenticated()) {
5546 return 0;
5547 }
5548
5549 client_lock.Unlock();
5550 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5551 client_lock.Lock();
5552 if (r < 0) {
5553 return r;
5554 }
5555
5556 whoami = monclient->get_global_id();
5557 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5558
5559 return 0;
5560}
5561
// Fetch the cluster FSMap ("fsmap.user" variant when `user` is true) so
// that MDS daemon addresses can be looked up.  Blocks (dropping and
// re-taking client_lock) until our map is at least as new as the latest
// version the monitors report.  Returns 0 or a negative errno.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop the client lock across the monitor round trip
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);  // monitor asked us to retry

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) and wait until fsmap_user catches up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full fsmap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5605
5606/**
5607 *
5608 * @mds_spec one of ID, rank, GID, "*"
5609 *
5610 */
5611int Client::mds_command(
5612 const std::string &mds_spec,
5613 const vector<string>& cmd,
5614 const bufferlist& inbl,
5615 bufferlist *outbl,
5616 string *outs,
5617 Context *onfinish)
5618{
5619 Mutex::Locker lock(client_lock);
5620
181888fb
FG
5621 if (!initialized)
5622 return -ENOTCONN;
7c673cae
FG
5623
5624 int r;
5625 r = authenticate();
5626 if (r < 0) {
5627 return r;
5628 }
5629
5630 r = fetch_fsmap(false);
5631 if (r < 0) {
5632 return r;
5633 }
5634
5635 // Look up MDS target(s) of the command
5636 std::vector<mds_gid_t> targets;
5637 r = resolve_mds(mds_spec, &targets);
5638 if (r < 0) {
5639 return r;
5640 }
5641
5642 // If daemons are laggy, we won't send them commands. If all
5643 // are laggy then we fail.
5644 std::vector<mds_gid_t> non_laggy;
5645 for (const auto gid : targets) {
5646 const auto info = fsmap->get_info_gid(gid);
5647 if (!info.laggy()) {
5648 non_laggy.push_back(gid);
5649 }
5650 }
5651 if (non_laggy.size() == 0) {
5652 *outs = "All targeted MDS daemons are laggy";
5653 return -ENOENT;
5654 }
5655
5656 if (metadata.empty()) {
5657 // We are called on an unmounted client, so metadata
5658 // won't be initialized yet.
5659 populate_metadata("");
5660 }
5661
5662 // Send commands to targets
5663 C_GatherBuilder gather(cct, onfinish);
5664 for (const auto target_gid : non_laggy) {
5665 const auto info = fsmap->get_info_gid(target_gid);
5666
5667 // Open a connection to the target MDS
5668 entity_inst_t inst = info.get_inst();
5669 ConnectionRef conn = messenger->get_connection(inst);
5670
5671 // Generate MDSCommandOp state
5672 auto &op = command_table.start_command();
5673
5674 op.on_finish = gather.new_sub();
5675 op.cmd = cmd;
5676 op.outbl = outbl;
5677 op.outs = outs;
5678 op.inbl = inbl;
5679 op.mds_gid = target_gid;
5680 op.con = conn;
5681
5682 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5683 << " tid=" << op.tid << cmd << dendl;
5684
5685 // Construct and send MCommand
5686 MCommand *m = op.get_message(monclient->get_fsid());
5687 conn->send_message(m);
5688 }
5689 gather.activate();
5690
5691 return 0;
5692}
5693
5694void Client::handle_command_reply(MCommandReply *m)
5695{
5696 ceph_tid_t const tid = m->get_tid();
5697
5698 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5699
5700 if (!command_table.exists(tid)) {
5701 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5702 m->put();
5703 return;
5704 }
5705
5706 auto &op = command_table.get_command(tid);
5707 if (op.outbl) {
5708 op.outbl->claim(m->get_data());
5709 }
5710 if (op.outs) {
5711 *op.outs = m->rs;
5712 }
5713
5714 if (op.on_finish) {
5715 op.on_finish->complete(m->r);
5716 }
5717
5718 command_table.erase(tid);
5719
5720 m->put();
5721}
5722
5723// -------------------
5724// MOUNT
5725
// Mount the filesystem: authenticate, subscribe to the (possibly
// namespace-qualified) mdsmap, optionally wait for an available MDS
// cluster, then stat the mount root and walk up its ancestry so quota
// roots are cached.  Idempotent if already mounted.
// Returns 0, a negative errno, or CEPH_FUSE_NO_MDS_UP.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  // reset any leftover unmount-in-progress flag from a prior unmount
  unmounting = false;

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  // when a specific mds namespace (filesystem) is configured, subscribe
  // to "mdsmap.<fscid>" instead of the default "mdsmap"
  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    // block until the MDS cluster is usable (or clearly never will be)
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // GETATTR the mount point, then each ancestor up to the root; an
  // EACCES above the mount point is tolerated (quotas may not work)
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  _ll_get(root);  // pin the root inode for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5838
5839// UNMOUNT
5840
// Close every MDS session and block until all of them are gone.
// NOTE(review): mds_sessions entries are apparently removed elsewhere as
// close acks arrive (which signals mount_cond), so we re-scan the map on
// each wakeup rather than iterating once — confirm against the session
// close handler before restructuring.
void Client::_close_sessions()
{
  while (!mds_sessions.empty()) {
    // send session closes!
    for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	p != mds_sessions.end();
	++p) {
      if (p->second->state != MetaSession::STATE_CLOSING) {
	_close_mds_session(p->second);
      }
    }

    // wait for sessions to close
    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
    mount_cond.Wait(client_lock);
  }
}
5858
31f18b77
FG
5859void Client::flush_mdlog_sync()
5860{
5861 if (mds_requests.empty())
5862 return;
5863 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5864 p != mds_sessions.end();
5865 ++p) {
5866 MetaSession *s = p->second;
5867 flush_mdlog(s);
5868 }
5869}
5870
5871void Client::flush_mdlog(MetaSession *session)
5872{
5873 // Only send this to Luminous or newer MDS daemons, older daemons
5874 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5875 const uint64_t features = session->con->get_features();
5876 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5877 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5878 session->con->send_message(m);
5879 }
5880}
5881
5882
b32b8144 5883void Client::_unmount()
7c673cae 5884{
181888fb
FG
5885 if (unmounting)
5886 return;
7c673cae
FG
5887
5888 ldout(cct, 2) << "unmounting" << dendl;
5889 unmounting = true;
5890
b32b8144
FG
5891 deleg_timeout = 0;
5892
31f18b77 5893 flush_mdlog_sync(); // flush the mdlog for pending requests, if any
7c673cae
FG
5894 while (!mds_requests.empty()) {
5895 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
5896 mount_cond.Wait(client_lock);
5897 }
5898
5899 if (tick_event)
5900 timer.cancel_event(tick_event);
5901 tick_event = 0;
5902
5903 cwd.reset();
5904
5905 // clean up any unclosed files
5906 while (!fd_map.empty()) {
5907 Fh *fh = fd_map.begin()->second;
5908 fd_map.erase(fd_map.begin());
5909 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
5910 _release_fh(fh);
5911 }
5912
5913 while (!ll_unclosed_fh_set.empty()) {
5914 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
5915 Fh *fh = *it;
5916 ll_unclosed_fh_set.erase(fh);
5917 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
5918 _release_fh(fh);
5919 }
5920
5921 while (!opened_dirs.empty()) {
5922 dir_result_t *dirp = *opened_dirs.begin();
5923 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
5924 _closedir(dirp);
5925 }
5926
5927 _ll_drop_pins();
5928
31f18b77
FG
5929 if (blacklisted) {
5930 ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;
5931
5932 if (cct->_conf->client_oc) {
5933 // Purge all cached data so that ObjectCacher doesn't get hung up
5934 // trying to flush it. ObjectCacher's behaviour on EBLACKLISTED
5935 // is to just leave things marked dirty
5936 // (http://tracker.ceph.com/issues/9105)
5937 for (const auto &i : inode_map) {
5938 objectcacher->purge_set(&(i.second->oset));
5939 }
5940 }
5941
5942 mounted = false;
5943 return;
5944 }
5945
7c673cae
FG
5946 while (unsafe_sync_write > 0) {
5947 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
5948 mount_cond.Wait(client_lock);
5949 }
5950
5951 if (cct->_conf->client_oc) {
5952 // flush/release all buffered data
5953 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
5954 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
5955 p != inode_map.end();
5956 p = next) {
5957 next = p;
5958 ++next;
5959 Inode *in = p->second;
5960 if (!in) {
5961 ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
5962 assert(in);
5963 }
5964 if (!in->caps.empty()) {
5965 InodeRef tmp_ref(in);
5966 _release(in);
5967 _flush(in, new C_Client_FlushComplete(this, in));
5968 }
5969 }
5970 }
5971
5972 flush_caps_sync();
5973 wait_sync_caps(last_flush_tid);
5974
5975 // empty lru cache
7c673cae
FG
5976 trim_cache();
5977
5978 while (lru.lru_get_size() > 0 ||
5979 !inode_map.empty()) {
5980 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
5981 << "+" << inode_map.size() << " items"
5982 << ", waiting (for caps to release?)"
5983 << dendl;
5984 utime_t until = ceph_clock_now() + utime_t(5, 0);
5985 int r = mount_cond.WaitUntil(client_lock, until);
5986 if (r == ETIMEDOUT) {
5987 dump_cache(NULL);
5988 }
5989 }
5990 assert(lru.lru_get_size() == 0);
5991 assert(inode_map.empty());
5992
5993 // stop tracing
5994 if (!cct->_conf->client_trace.empty()) {
5995 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
5996 traceout.close();
5997 }
5998
5999 _close_sessions();
6000
6001 mounted = false;
6002
6003 ldout(cct, 2) << "unmounted." << dendl;
6004}
6005
b32b8144
FG
6006void Client::unmount()
6007{
6008 Mutex::Locker lock(client_lock);
6009 _unmount();
6010}
6011
7c673cae
FG
6012void Client::flush_cap_releases()
6013{
6014 // send any cap releases
6015 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6016 p != mds_sessions.end();
6017 ++p) {
6018 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
6019 p->first)) {
6020 if (cct->_conf->client_inject_release_failure) {
6021 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6022 p->second->release->put();
6023 } else {
6024 p->second->con->send_message(p->second->release);
6025 }
6026 p->second->release = 0;
6027 }
6028 }
6029}
6030
// Periodic housekeeping, self-rescheduling via `timer`: times out
// mount-phase requests, renews caps, flushes cap releases, runs delayed
// cap checks that have come due, and trims the cache.
void Client::tick()
{
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    // test hook: delay this tick once, then disarm the injection
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  // re-arm the next tick before doing any of the work below
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
      // Called back via Timer, which takes client_lock for us
      assert(client_lock.is_locked_by_me());
      tick();
    }));

  utime_t now = ceph_clock_now();

  // before mount completes, enforce the mount timeout on the oldest request
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      // wake anyone blocked on map updates or session opens
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: process entries whose hold_caps_until deadline has
  // passed; the loop assumes list order tracks the deadline (it breaks
  // at the first entry still in the future)
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;  // advance before pop_front removes the current entry
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6087
6088void Client::renew_caps()
6089{
6090 ldout(cct, 10) << "renew_caps()" << dendl;
6091 last_cap_renew = ceph_clock_now();
6092
6093 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6094 p != mds_sessions.end();
6095 ++p) {
6096 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6097 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6098 renew_caps(p->second);
6099 }
6100}
6101
6102void Client::renew_caps(MetaSession *session)
6103{
6104 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6105 session->last_cap_renew_request = ceph_clock_now();
6106 uint64_t seq = ++session->cap_renew_seq;
6107 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6108}
6109
6110
6111// ===============================================================
6112// high level (POSIXy) interface
6113
6114int Client::_do_lookup(Inode *dir, const string& name, int mask,
6115 InodeRef *target, const UserPerm& perms)
6116{
6117 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6118 MetaRequest *req = new MetaRequest(op);
6119 filepath path;
6120 dir->make_nosnap_relative_path(path);
6121 path.push_dentry(name);
6122 req->set_filepath(path);
6123 req->set_inode(dir);
6124 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6125 mask |= DEBUG_GETATTR_CAPS;
6126 req->head.args.getattr.mask = mask;
6127
6128 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6129
6130 int r = make_request(req, perms, target);
6131 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6132 return r;
6133}
6134
// Look up `dname` in directory `dir`, preferring cached dentries whose
// lease (or the directory's shared cap) is still valid; falls back to an
// MDS LOOKUP via _do_lookup().  `mask` is the cap set the caller wants on
// the resulting inode.  On success *target is set; returns 0 or a
// negative errno (-ENOENT may be concluded locally on a complete dir).
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname == "..") {
    if (dir->dn_set.empty())
      *target = dir;  // no parent linked: ".." resolves to itself
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // the magic snapshot directory is synthesized locally
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	   << " seq " << dn->lease_seq
	   << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	// the lease is only usable while the issuing session's cap
	// generation still matches the lease generation
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  // we hold the complete directory listing: absence is authoritative
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss or stale lease: ask the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;  // valid cached negative dentry
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6236
// Find or create a dentry for `name` in `dir`, returned via *pdn.
// When `expect_null` is set and a cached dentry with a still-valid lease
// already has an inode attached, fail with -EEXIST (exclusive-create
// semantics).  Returns 0 on success.
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession *s = mds_sessions[dn->lease_mds];
      // lease counts only while the session cap generation matches
      if (s->cap_ttl > now &&
	  s->cap_gen == dn->lease_gen) {
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one (no inode attached yet)
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6268
/**
 * Walk a path component by component, resolving symlinks, and return the
 * final inode in *end.
 *
 * Starts from root for absolute paths, cwd otherwise.  Intermediate
 * ("directory") symlinks are always followed; a symlink in the final
 * position is followed only when @followsym is true.  Symlink expansion is
 * bounded by MAXSYMLINKS to guard against loops.
 *
 * @param origpath  path to resolve (copied; may be rewritten during walk)
 * @param end       out: inode of the last component (may be NULL if unwanted)
 * @param perms     credentials used for permission checks and lookups
 * @param followsym follow a symlink in the trailing position
 * @param mask      extra caps wanted on the final component's lookup
 * @return 0 on success, -ELOOP on symlink loops, -ENOENT or other negative
 *         errno from lookup/permission checks.  client_lock must be held.
 */
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;  // total symlinks traversed so far (loop guard)

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // enforce search (execute) permission on each directory we traverse
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  cur = root;  // absolute target: restart the walk at the root
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);  // descend into the component we just resolved
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6350
6351
6352// namespace ops
6353
6354int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6355{
6356 Mutex::Locker lock(client_lock);
6357 tout(cct) << "link" << std::endl;
6358 tout(cct) << relexisting << std::endl;
6359 tout(cct) << relpath << std::endl;
6360
181888fb
FG
6361 if (unmounting)
6362 return -ENOTCONN;
6363
7c673cae
FG
6364 filepath existing(relexisting);
6365
6366 InodeRef in, dir;
6367 int r = path_walk(existing, &in, perm, true);
6368 if (r < 0)
6369 return r;
6370 if (std::string(relpath) == "/") {
6371 r = -EEXIST;
6372 return r;
6373 }
6374 filepath path(relpath);
6375 string name = path.last_dentry();
6376 path.pop_dentry();
6377
6378 r = path_walk(path, &dir, perm, true);
6379 if (r < 0)
6380 return r;
6381 if (cct->_conf->client_permissions) {
6382 if (S_ISDIR(in->mode)) {
6383 r = -EPERM;
6384 return r;
6385 }
6386 r = may_hardlink(in.get(), perm);
6387 if (r < 0)
6388 return r;
6389 r = may_create(dir.get(), perm);
6390 if (r < 0)
6391 return r;
6392 }
6393 r = _link(in.get(), dir.get(), name.c_str(), perm);
6394 return r;
6395}
6396
6397int Client::unlink(const char *relpath, const UserPerm& perm)
6398{
6399 Mutex::Locker lock(client_lock);
6400 tout(cct) << "unlink" << std::endl;
6401 tout(cct) << relpath << std::endl;
6402
181888fb
FG
6403 if (unmounting)
6404 return -ENOTCONN;
6405
7c673cae
FG
6406 if (std::string(relpath) == "/")
6407 return -EISDIR;
6408
6409 filepath path(relpath);
6410 string name = path.last_dentry();
6411 path.pop_dentry();
6412 InodeRef dir;
6413 int r = path_walk(path, &dir, perm);
6414 if (r < 0)
6415 return r;
6416 if (cct->_conf->client_permissions) {
6417 r = may_delete(dir.get(), name.c_str(), perm);
6418 if (r < 0)
6419 return r;
6420 }
6421 return _unlink(dir.get(), name.c_str(), perm);
6422}
6423
6424int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6425{
6426 Mutex::Locker lock(client_lock);
6427 tout(cct) << "rename" << std::endl;
6428 tout(cct) << relfrom << std::endl;
6429 tout(cct) << relto << std::endl;
6430
181888fb
FG
6431 if (unmounting)
6432 return -ENOTCONN;
6433
7c673cae
FG
6434 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6435 return -EBUSY;
6436
6437 filepath from(relfrom);
6438 filepath to(relto);
6439 string fromname = from.last_dentry();
6440 from.pop_dentry();
6441 string toname = to.last_dentry();
6442 to.pop_dentry();
6443
6444 InodeRef fromdir, todir;
6445 int r = path_walk(from, &fromdir, perm);
6446 if (r < 0)
6447 goto out;
6448 r = path_walk(to, &todir, perm);
6449 if (r < 0)
6450 goto out;
6451
6452 if (cct->_conf->client_permissions) {
6453 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6454 if (r < 0)
6455 return r;
6456 r = may_delete(todir.get(), toname.c_str(), perm);
6457 if (r < 0 && r != -ENOENT)
6458 return r;
6459 }
6460 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6461out:
6462 return r;
6463}
6464
6465// dirs
6466
6467int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6468{
6469 Mutex::Locker lock(client_lock);
6470 tout(cct) << "mkdir" << std::endl;
6471 tout(cct) << relpath << std::endl;
6472 tout(cct) << mode << std::endl;
6473 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6474
181888fb
FG
6475 if (unmounting)
6476 return -ENOTCONN;
6477
7c673cae
FG
6478 if (std::string(relpath) == "/")
6479 return -EEXIST;
6480
6481 filepath path(relpath);
6482 string name = path.last_dentry();
6483 path.pop_dentry();
6484 InodeRef dir;
6485 int r = path_walk(path, &dir, perm);
6486 if (r < 0)
6487 return r;
6488 if (cct->_conf->client_permissions) {
6489 r = may_create(dir.get(), perm);
6490 if (r < 0)
6491 return r;
6492 }
6493 return _mkdir(dir.get(), name.c_str(), mode, perm);
6494}
6495
/**
 * Create every missing directory along relpath (like `mkdir -p`).
 *
 * Phase 1 walks the path as far as it already exists; phase 2 creates the
 * remaining components one level at a time.  A racing creator is tolerated:
 * -EEXIST on an intermediate component falls back to a lookup.
 *
 * NOTE(review): the walk starts from cwd unconditionally; an absolute
 * relpath does not reset to root here the way path_walk() does — confirm
 * callers only pass relative paths (or paths meant relative to cwd).
 *
 * @return 0 on success, -EEXIST if the whole path already exists, or a
 *         negative errno from lookup/permission/mkdir.
 */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;   // every component already present
  if (r!=-ENOENT) return r;              // stopped for a reason other than "missing"
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // someone may have created this component concurrently; for
    // intermediate components treat -EEXIST as success and just look it up
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6552
6553int Client::rmdir(const char *relpath, const UserPerm& perms)
6554{
6555 Mutex::Locker lock(client_lock);
6556 tout(cct) << "rmdir" << std::endl;
6557 tout(cct) << relpath << std::endl;
6558
181888fb
FG
6559 if (unmounting)
6560 return -ENOTCONN;
6561
7c673cae
FG
6562 if (std::string(relpath) == "/")
6563 return -EBUSY;
6564
6565 filepath path(relpath);
6566 string name = path.last_dentry();
6567 path.pop_dentry();
6568 InodeRef dir;
6569 int r = path_walk(path, &dir, perms);
6570 if (r < 0)
6571 return r;
6572 if (cct->_conf->client_permissions) {
6573 int r = may_delete(dir.get(), name.c_str(), perms);
6574 if (r < 0)
6575 return r;
6576 }
6577 return _rmdir(dir.get(), name.c_str(), perms);
6578}
6579
6580int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6581{
6582 Mutex::Locker lock(client_lock);
6583 tout(cct) << "mknod" << std::endl;
6584 tout(cct) << relpath << std::endl;
6585 tout(cct) << mode << std::endl;
6586 tout(cct) << rdev << std::endl;
6587
181888fb
FG
6588 if (unmounting)
6589 return -ENOTCONN;
6590
7c673cae
FG
6591 if (std::string(relpath) == "/")
6592 return -EEXIST;
6593
6594 filepath path(relpath);
6595 string name = path.last_dentry();
6596 path.pop_dentry();
6597 InodeRef dir;
6598 int r = path_walk(path, &dir, perms);
6599 if (r < 0)
6600 return r;
6601 if (cct->_conf->client_permissions) {
6602 int r = may_create(dir.get(), perms);
6603 if (r < 0)
6604 return r;
6605 }
6606 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6607}
6608
6609// symlinks
6610
6611int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6612{
6613 Mutex::Locker lock(client_lock);
6614 tout(cct) << "symlink" << std::endl;
6615 tout(cct) << target << std::endl;
6616 tout(cct) << relpath << std::endl;
6617
181888fb
FG
6618 if (unmounting)
6619 return -ENOTCONN;
6620
7c673cae
FG
6621 if (std::string(relpath) == "/")
6622 return -EEXIST;
6623
6624 filepath path(relpath);
6625 string name = path.last_dentry();
6626 path.pop_dentry();
6627 InodeRef dir;
6628 int r = path_walk(path, &dir, perms);
6629 if (r < 0)
6630 return r;
6631 if (cct->_conf->client_permissions) {
6632 int r = may_create(dir.get(), perms);
6633 if (r < 0)
6634 return r;
6635 }
6636 return _symlink(dir.get(), name.c_str(), target, perms);
6637}
6638
6639int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6640{
6641 Mutex::Locker lock(client_lock);
6642 tout(cct) << "readlink" << std::endl;
6643 tout(cct) << relpath << std::endl;
6644
181888fb
FG
6645 if (unmounting)
6646 return -ENOTCONN;
6647
7c673cae
FG
6648 filepath path(relpath);
6649 InodeRef in;
6650 int r = path_walk(path, &in, perms, false);
6651 if (r < 0)
6652 return r;
6653
6654 return _readlink(in.get(), buf, size);
6655}
6656
6657int Client::_readlink(Inode *in, char *buf, size_t size)
6658{
6659 if (!in->is_symlink())
6660 return -EINVAL;
6661
6662 // copy into buf (at most size bytes)
6663 int r = in->symlink.length();
6664 if (r > (int)size)
6665 r = size;
6666 memcpy(buf, in->symlink.c_str(), r);
6667 return r;
6668}
6669
6670
6671// inode stuff
6672
/**
 * Refresh inode attributes covered by @mask.
 *
 * If all the caps in @mask are already issued for @in (and @force is not
 * set), the cached attributes are considered valid and no MDS round trip is
 * made.  Otherwise a GETATTR request is sent to the MDS.
 *
 * @param in    inode to refresh
 * @param mask  CEPH_CAP_* / CEPH_STAT_CAP_* bits describing which attrs
 * @param perms credentials for the MDS request
 * @param force issue the request even when caps say the cache is fresh
 * @return 0 on success (including cache hit), negative errno on failure
 */
int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
{
  // cache hit: every requested cap is issued, so cached attrs are authoritative
  bool yes = in->caps_issued_mask(mask, true);

  ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
  if (yes && !force)
    return 0;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->head.args.getattr.mask = mask;

  int res = make_request(req, perms);  // consumes req on completion
  ldout(cct, 10) << "_getattr result=" << res << dendl;
  return res;
}
6692
/**
 * Core setattr implementation.
 *
 * Attempts to apply as many of the requested attribute changes as possible
 * locally (when we hold the relevant EXCL caps, changes are made to the
 * cached inode and the caps are marked dirty for later writeback).  Any
 * bits that cannot be satisfied locally are sent to the MDS in a SETATTR
 * request.  Snapshotted inodes are read-only (-EROFS), and size increases
 * are checked against quota (-EDQUOT).
 *
 * @param in    inode to modify
 * @param stx   new attribute values (only fields selected by @mask are read)
 * @param mask  CEPH_SETATTR_* bits selecting which attributes to change
 * @param perms caller credentials
 * @param inp   out: inode ref from the MDS reply (when a request is sent)
 * @return 0 on success, negative errno on failure
 */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;  // snapshots are immutable
  }
  // growing the file must not push us past the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  // If a different user already has locally-dirtied caps pending writeback,
  // we cannot mix their identity with ours; force a synchronous MDS request.
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever EXCL cap we hold so the new ctime gets written back;
    // with no EXCL cap at all, fall through to an MDS request
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  // With AUTH_EXCL we may change ownership/mode/btime purely locally.
  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // truncation or an explicit request must clear setuid/setgid bits
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;  // ownership change also squashes suid/sgid
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  // With FILE_EXCL we may change mtime/atime locally.
  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
	in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
	in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;  // tell the MDS our timestamps jumped deliberately
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask) {
    // everything was satisfied locally; bump change_attr and we're done
    in->change_attr++;
    return 0;
  }

force_request:
  // whatever remains in mask must be applied by the MDS
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  // inode_drop: caps we relinquish so other clients see the change promptly
  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    // refuse truncation beyond the cluster's configured maximum file size
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;  // re-fetch these attrs when the reply lacks them

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6883
/* Note that we only care about attrs that setattr cares about */
/**
 * Translate a legacy struct stat into a ceph_statx for the setattr path.
 * Only the fields setattr can change are copied; all other statx fields
 * are left untouched (callers pass an uninitialized stx and rely on the
 * setattr mask to select which copied fields are actually read).
 */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
  stx->stx_mtime = st->st_mtim;   // Linux timespec members — st_mtim/st_atim
  stx->stx_atime = st->st_atim;
}
6894
6895int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6896 const UserPerm& perms, InodeRef *inp)
6897{
6898 int ret = _do_setattr(in, stx, mask, perms, inp);
6899 if (ret < 0)
6900 return ret;
6901 if (mask & CEPH_SETATTR_MODE)
6902 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6903 return ret;
6904}
6905
/**
 * Permission-checked statx-based setattr entry point.
 * Strips any mask bits setattr does not support, runs the optional
 * client-side permission check, and delegates to __setattrx().
 */
int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
		      const UserPerm& perms)
{
  // only these attribute bits are settable through this path
  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
	   CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
	   CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
	   CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
  if (cct->_conf->client_permissions) {
    int r = may_setattr(in.get(), stx, mask, perms);
    if (r < 0)
      return r;
  }
  return __setattrx(in.get(), stx, mask, perms);
}
6920
6921int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6922 const UserPerm& perms)
6923{
6924 struct ceph_statx stx;
6925
6926 stat_to_statx(attr, &stx);
6927 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
6928
6929 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6930 mask &= ~CEPH_SETATTR_UID;
6931 }
6932 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6933 mask &= ~CEPH_SETATTR_GID;
6934 }
6935
7c673cae
FG
6936 return _setattrx(in, &stx, mask, perms);
6937}
6938
6939int Client::setattr(const char *relpath, struct stat *attr, int mask,
6940 const UserPerm& perms)
6941{
6942 Mutex::Locker lock(client_lock);
6943 tout(cct) << "setattr" << std::endl;
6944 tout(cct) << relpath << std::endl;
6945 tout(cct) << mask << std::endl;
6946
181888fb
FG
6947 if (unmounting)
6948 return -ENOTCONN;
6949
7c673cae
FG
6950 filepath path(relpath);
6951 InodeRef in;
6952 int r = path_walk(path, &in, perms);
6953 if (r < 0)
6954 return r;
6955 return _setattr(in, attr, mask, perms);
6956}
6957
6958int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6959 const UserPerm& perms, int flags)
6960{
6961 Mutex::Locker lock(client_lock);
6962 tout(cct) << "setattrx" << std::endl;
6963 tout(cct) << relpath << std::endl;
6964 tout(cct) << mask << std::endl;
6965
181888fb
FG
6966 if (unmounting)
6967 return -ENOTCONN;
6968
7c673cae
FG
6969 filepath path(relpath);
6970 InodeRef in;
6971 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6972 if (r < 0)
6973 return r;
6974 return _setattrx(in, stx, mask, perms);
6975}
6976
6977int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6978{
6979 Mutex::Locker lock(client_lock);
6980 tout(cct) << "fsetattr" << std::endl;
6981 tout(cct) << fd << std::endl;
6982 tout(cct) << mask << std::endl;
6983
181888fb
FG
6984 if (unmounting)
6985 return -ENOTCONN;
6986
7c673cae
FG
6987 Fh *f = get_filehandle(fd);
6988 if (!f)
6989 return -EBADF;
6990#if defined(__linux__) && defined(O_PATH)
6991 if (f->flags & O_PATH)
6992 return -EBADF;
6993#endif
6994 return _setattr(f->inode, attr, mask, perms);
6995}
6996
6997int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6998{
6999 Mutex::Locker lock(client_lock);
7000 tout(cct) << "fsetattr" << std::endl;
7001 tout(cct) << fd << std::endl;
7002 tout(cct) << mask << std::endl;
7003
181888fb
FG
7004 if (unmounting)
7005 return -ENOTCONN;
7006
7c673cae
FG
7007 Fh *f = get_filehandle(fd);
7008 if (!f)
7009 return -EBADF;
7010#if defined(__linux__) && defined(O_PATH)
7011 if (f->flags & O_PATH)
7012 return -EBADF;
7013#endif
7014 return _setattrx(f->inode, stx, mask, perms);
7015}
7016
7017int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7018 frag_info_t *dirstat, int mask)
7019{
7020 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7021 Mutex::Locker lock(client_lock);
7022 tout(cct) << "stat" << std::endl;
7023 tout(cct) << relpath << std::endl;
181888fb
FG
7024
7025 if (unmounting)
7026 return -ENOTCONN;
7027
7c673cae
FG
7028 filepath path(relpath);
7029 InodeRef in;
7030 int r = path_walk(path, &in, perms, true, mask);
7031 if (r < 0)
7032 return r;
7033 r = _getattr(in, mask, perms);
7034 if (r < 0) {
7035 ldout(cct, 3) << "stat exit on error!" << dendl;
7036 return r;
7037 }
7038 fill_stat(in, stbuf, dirstat);
7039 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7040 return r;
7041}
7042
7043unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7044{
7045 unsigned mask = 0;
7046
7047 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7048 if (flags & AT_NO_ATTR_SYNC)
7049 goto out;
7050
7051 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7052 mask |= CEPH_CAP_PIN;
7053 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7054 mask |= CEPH_CAP_AUTH_SHARED;
7055 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7056 mask |= CEPH_CAP_LINK_SHARED;
7057 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7058 mask |= CEPH_CAP_FILE_SHARED;
7059 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7060 mask |= CEPH_CAP_XATTR_SHARED;
7061out:
7062 return mask;
7063}
7064
7065int Client::statx(const char *relpath, struct ceph_statx *stx,
7066 const UserPerm& perms,
7067 unsigned int want, unsigned int flags)
7068{
7069 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
7070 Mutex::Locker lock(client_lock);
7071 tout(cct) << "statx" << std::endl;
7072 tout(cct) << relpath << std::endl;
181888fb
FG
7073
7074 if (unmounting)
7075 return -ENOTCONN;
7076
7c673cae
FG
7077 filepath path(relpath);
7078 InodeRef in;
7079
7080 unsigned mask = statx_to_mask(flags, want);
7081
7082 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7083 if (r < 0)
7084 return r;
7085
7086 r = _getattr(in, mask, perms);
7087 if (r < 0) {
7088 ldout(cct, 3) << "statx exit on error!" << dendl;
7089 return r;
7090 }
7091
7092 fill_statx(in, mask, stx);
7093 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7094 return r;
7095}
7096
7097int Client::lstat(const char *relpath, struct stat *stbuf,
7098 const UserPerm& perms, frag_info_t *dirstat, int mask)
7099{
7100 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7101 Mutex::Locker lock(client_lock);
7102 tout(cct) << "lstat" << std::endl;
7103 tout(cct) << relpath << std::endl;
181888fb
FG
7104
7105 if (unmounting)
7106 return -ENOTCONN;
7107
7c673cae
FG
7108 filepath path(relpath);
7109 InodeRef in;
7110 // don't follow symlinks
7111 int r = path_walk(path, &in, perms, false, mask);
7112 if (r < 0)
7113 return r;
7114 r = _getattr(in, mask, perms);
7115 if (r < 0) {
7116 ldout(cct, 3) << "lstat exit on error!" << dendl;
7117 return r;
7118 }
7119 fill_stat(in, stbuf, dirstat);
7120 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7121 return r;
7122}
7123
/**
 * Populate a struct stat from the cached inode.
 *
 * For directories, st_nlink is synthesized from the subdir count (parent
 * dentry + "." + one per subdirectory); any nlink other than 0 or 1 for a
 * directory indicates corrupted state and aborts.  st_size for directories
 * is either recursive bytes or entry count depending on the
 * client_dirsize_rbytes config.
 *
 * @param dirstat optional out: fragment stats for directories
 * @param rstat   optional out: recursive stats
 * @return the caps currently issued on the inode (callers use this to
 *         decide how trustworthy the filled values are)
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  st->st_dev = in->snapid;  // snapid doubles as the "device" number
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    switch (in->nlink) {
      case 0:
	st->st_nlink = 0; /* dir is unlinked */
	break;
      case 1:
	st->st_nlink = 1 /* parent dentry */
		       + 1 /* <dir>/. */
		       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	break;
      default:
	ceph_abort();  // a directory inode must have nlink 0 or 1
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // ctime is reported as max(ctime, mtime)
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // directory "size": recursive bytes or entry count, per config
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
  }
  st->st_blksize = MAX(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7185
/**
 * Populate a ceph_statx from the cached inode.
 *
 * @mask is the cap mask that was actually synchronized; only attribute
 * groups covered by it are filled (and flagged in stx->stx_mask).  A mask
 * of 0 means the caller used AT_NO_ATTR_SYNC and everything cached is
 * reported.  Inode number, rdev, dev and blksize are always filled.
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  // ownership / mode / birth time — guarded by the AUTH cap
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  // link count — guarded by the LINK cap
  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      switch (in->nlink) {
	case 0:
	  stx->stx_nlink = 0; /* dir is unlinked */
	  break;
	case 1:
	  stx->stx_nlink = 1 /* parent dentry */
			   + 1 /* <dir>/. */
			   + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	  break;
	default:
	  ceph_abort();  // a directory inode must have nlink 0 or 1
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  // timestamps and size — guarded by the FILE cap
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // directory "size": recursive bytes or entry count, per config
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // ctime is reported as max(ctime, mtime)
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7268
// Mark a dentry as recently used so the LRU trimmer keeps it longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7273
7274int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7275{
7276 Mutex::Locker lock(client_lock);
7277 tout(cct) << "chmod" << std::endl;
7278 tout(cct) << relpath << std::endl;
7279 tout(cct) << mode << std::endl;
181888fb
FG
7280
7281 if (unmounting)
7282 return -ENOTCONN;
7283
7c673cae
FG
7284 filepath path(relpath);
7285 InodeRef in;
7286 int r = path_walk(path, &in, perms);
7287 if (r < 0)
7288 return r;
7289 struct stat attr;
7290 attr.st_mode = mode;
7291 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7292}
7293
7294int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7295{
7296 Mutex::Locker lock(client_lock);
7297 tout(cct) << "fchmod" << std::endl;
7298 tout(cct) << fd << std::endl;
7299 tout(cct) << mode << std::endl;
181888fb
FG
7300
7301 if (unmounting)
7302 return -ENOTCONN;
7303
7c673cae
FG
7304 Fh *f = get_filehandle(fd);
7305 if (!f)
7306 return -EBADF;
7307#if defined(__linux__) && defined(O_PATH)
7308 if (f->flags & O_PATH)
7309 return -EBADF;
7310#endif
7311 struct stat attr;
7312 attr.st_mode = mode;
7313 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7314}
7315
7316int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7317{
7318 Mutex::Locker lock(client_lock);
7319 tout(cct) << "lchmod" << std::endl;
7320 tout(cct) << relpath << std::endl;
7321 tout(cct) << mode << std::endl;
181888fb
FG
7322
7323 if (unmounting)
7324 return -ENOTCONN;
7325
7c673cae
FG
7326 filepath path(relpath);
7327 InodeRef in;
7328 // don't follow symlinks
7329 int r = path_walk(path, &in, perms, false);
7330 if (r < 0)
7331 return r;
7332 struct stat attr;
7333 attr.st_mode = mode;
7334 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7335}
7336
7337int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7338 const UserPerm& perms)
7339{
7340 Mutex::Locker lock(client_lock);
7341 tout(cct) << "chown" << std::endl;
7342 tout(cct) << relpath << std::endl;
7343 tout(cct) << new_uid << std::endl;
7344 tout(cct) << new_gid << std::endl;
181888fb
FG
7345
7346 if (unmounting)
7347 return -ENOTCONN;
7348
7c673cae
FG
7349 filepath path(relpath);
7350 InodeRef in;
7351 int r = path_walk(path, &in, perms);
7352 if (r < 0)
7353 return r;
7354 struct stat attr;
7355 attr.st_uid = new_uid;
7356 attr.st_gid = new_gid;
181888fb 7357 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7358}
7359
7360int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7361{
7362 Mutex::Locker lock(client_lock);
7363 tout(cct) << "fchown" << std::endl;
7364 tout(cct) << fd << std::endl;
7365 tout(cct) << new_uid << std::endl;
7366 tout(cct) << new_gid << std::endl;
181888fb
FG
7367
7368 if (unmounting)
7369 return -ENOTCONN;
7370
7c673cae
FG
7371 Fh *f = get_filehandle(fd);
7372 if (!f)
7373 return -EBADF;
7374#if defined(__linux__) && defined(O_PATH)
7375 if (f->flags & O_PATH)
7376 return -EBADF;
7377#endif
7378 struct stat attr;
7379 attr.st_uid = new_uid;
7380 attr.st_gid = new_gid;
7381 int mask = 0;
7382 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7383 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7384 return _setattr(f->inode, &attr, mask, perms);
7385}
7386
7387int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7388 const UserPerm& perms)
7389{
7390 Mutex::Locker lock(client_lock);
7391 tout(cct) << "lchown" << std::endl;
7392 tout(cct) << relpath << std::endl;
7393 tout(cct) << new_uid << std::endl;
7394 tout(cct) << new_gid << std::endl;
181888fb
FG
7395
7396 if (unmounting)
7397 return -ENOTCONN;
7398
7c673cae
FG
7399 filepath path(relpath);
7400 InodeRef in;
7401 // don't follow symlinks
7402 int r = path_walk(path, &in, perms, false);
7403 if (r < 0)
7404 return r;
7405 struct stat attr;
7406 attr.st_uid = new_uid;
7407 attr.st_gid = new_gid;
7408 int mask = 0;
7409 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7410 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7411 return _setattr(in, &attr, mask, perms);
7412}
7413
7414int Client::utime(const char *relpath, struct utimbuf *buf,
7415 const UserPerm& perms)
7416{
7417 Mutex::Locker lock(client_lock);
7418 tout(cct) << "utime" << std::endl;
7419 tout(cct) << relpath << std::endl;
7420 tout(cct) << buf->modtime << std::endl;
7421 tout(cct) << buf->actime << std::endl;
181888fb
FG
7422
7423 if (unmounting)
7424 return -ENOTCONN;
7425
7c673cae
FG
7426 filepath path(relpath);
7427 InodeRef in;
7428 int r = path_walk(path, &in, perms);
7429 if (r < 0)
7430 return r;
7431 struct stat attr;
7432 stat_set_mtime_sec(&attr, buf->modtime);
7433 stat_set_mtime_nsec(&attr, 0);
7434 stat_set_atime_sec(&attr, buf->actime);
7435 stat_set_atime_nsec(&attr, 0);
7436 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7437}
7438
7439int Client::lutime(const char *relpath, struct utimbuf *buf,
7440 const UserPerm& perms)
7441{
7442 Mutex::Locker lock(client_lock);
7443 tout(cct) << "lutime" << std::endl;
7444 tout(cct) << relpath << std::endl;
7445 tout(cct) << buf->modtime << std::endl;
7446 tout(cct) << buf->actime << std::endl;
181888fb
FG
7447
7448 if (unmounting)
7449 return -ENOTCONN;
7450
7c673cae
FG
7451 filepath path(relpath);
7452 InodeRef in;
7453 // don't follow symlinks
7454 int r = path_walk(path, &in, perms, false);
7455 if (r < 0)
7456 return r;
7457 struct stat attr;
7458 stat_set_mtime_sec(&attr, buf->modtime);
7459 stat_set_mtime_nsec(&attr, 0);
7460 stat_set_atime_sec(&attr, buf->actime);
7461 stat_set_atime_nsec(&attr, 0);
7462 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7463}
7464
7465int Client::flock(int fd, int operation, uint64_t owner)
7466{
7467 Mutex::Locker lock(client_lock);
7468 tout(cct) << "flock" << std::endl;
7469 tout(cct) << fd << std::endl;
7470 tout(cct) << operation << std::endl;
7471 tout(cct) << owner << std::endl;
181888fb
FG
7472
7473 if (unmounting)
7474 return -ENOTCONN;
7475
7c673cae
FG
7476 Fh *f = get_filehandle(fd);
7477 if (!f)
7478 return -EBADF;
7479
7480 return _flock(f, operation, owner);
7481}
7482
// Open a directory for reading.  On success *dirpp receives a new
// dir_result_t owned by the client (released via closedir()).
// Returns 0 or a negative errno.
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "opendir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  // Optional client-side permission enforcement: need read access.
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
  if (r != -ENOTDIR)
      tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}
7508
7509int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7510{
7511 if (!in->is_dir())
7512 return -ENOTDIR;
7513 *dirpp = new dir_result_t(in, perms);
7514 opened_dirs.insert(*dirpp);
1adf2230 7515 ldout(cct, 8) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
7516 return 0;
7517}
7518
7519
// Close a directory handle previously returned by opendir().
// Always returns 0.
int Client::closedir(dir_result_t *dir)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "closedir" << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7530
// Internal helper: release a dir_result_t.  Caller holds client_lock.
// Drops the inode reference, frees any buffered dirents, unregisters the
// handle, and deletes it.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7542
7543void Client::rewinddir(dir_result_t *dirp)
7544{
7545 Mutex::Locker lock(client_lock);
7c673cae 7546 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
181888fb
FG
7547
7548 if (unmounting)
7549 return;
7550
7c673cae
FG
7551 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7552 _readdir_drop_dirp_buffer(d);
7553 d->reset();
7554}
7555
// Report the current position of a directory stream (an opaque cookie
// suitable for seekdir()).
// NOTE(review): reads d->offset without taking client_lock, unlike the
// other dir operations here — confirm whether that is intentional.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7562
// Reposition a directory stream to `offset` (a cookie from telldir()).
// Invalidates the buffered fragment when the target position cannot be
// served from it.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  // Any seek breaks the bookkeeping used to mark the dir cache complete.
  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // Hash-ordered listing: only a backwards seek forces a restart.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Frag-ordered listing: drop the buffer when seeking to the start,
    // into a different fragment, or backwards within this fragment.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7596
7597
7598//struct dirent {
7599// ino_t d_ino; /* inode number */
7600// off_t d_off; /* offset to the next dirent */
7601// unsigned short d_reclen; /* length of this record */
7602// unsigned char d_type; /* type of file */
7603// char d_name[256]; /* filename */
7604//};
// Populate a struct dirent for callback delivery.
// `name` is truncated to 255 chars and NUL-terminated; `next_off` is the
// cookie of the *next* entry (stored in d_off where the platform has it).
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  // Convert S_IF* file-type bits to the DT_* encoding used by dirent.
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7620
// Advance a directory stream to the next fragment, or mark it at end if
// the current fragment is the rightmost one.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    // Re-resolve the fragment in case our dirfragtree was stale.
    _readdir_rechoose_frag(dirp);
  }
}
7646
// Re-map the stream's current fragment through the (possibly updated)
// dirfragtree; if it has changed, restart iteration at the start of the
// newly chosen fragment.  No-op for hash-ordered streams.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7663
7664void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7665{
7666 ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
7667 dirp->buffer.clear();
7668}
7669
// Fetch one directory fragment from the MDS into dirp->buffer.
// On -EAGAIN (fragment moved) it re-resolves the fragment and retries
// recursively; on any other error it marks the stream at end.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  assert(dirp);
  assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // Continue a partially-read fragment after this name.
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // Fragment was split/merged under us; re-choose and retry.
    ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7724
// Comparator for std::lower_bound over the readdir cache: orders dentries
// by directory offset using the frag-aware fpos comparison.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7730
// Serve a readdir from the locally cached, complete-and-ordered dentry
// list, invoking `cb` once per entry.  Returns 0 at end of directory,
// -EAGAIN if the cache stops being complete mid-iteration (caller falls
// back to fetching from the MDS), a negative errno on error, or >0 if the
// callback asked to stop early.
// client_lock is held on entry and is dropped around each callback.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // Resume at the first cached dentry at or after the current offset.
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // The cache may have been invalidated while the lock was dropped
    // around the previous callback; if so, bail out to the MDS path.
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7814
// Core readdir loop: synthesizes "." and "..", then serves entries from
// the local dentry cache when it is complete, otherwise fetches fragments
// from the MDS.  `cb` is invoked once per entry with client_lock dropped;
// a negative callback return aborts with that value, a positive one stops
// iteration early.  Returns 0 at end of directory.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
	   << dec << " at_end=" << dirp->at_end()
	   << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // Offset 0 is the synthetic "." entry.
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // Offset 1 is the synthetic ".." entry (the dir itself if unparented).
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache became unusable mid-stream; fall through to MDS path.
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // Fragment was delivered in chunks; fetch the next chunk.
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue; // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // Finished the rightmost fragment: if nothing changed under us during
    // the whole walk, mark the directory cache complete (and ordered).
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort(); // not reached: the loop above always returns
  return 0;
}
8007
8008
// Plain readdir_r: fill `de` with the next entry.  Returns 1 on success,
// 0 at end of directory, <0 on error.  Thin wrapper over readdirplus_r
// with no statx/inode output requested.
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
8013
8014/*
8015 * readdirplus_r
8016 *
8017 * returns
8018 * 1 if we got a dirent
8019 * 0 for end of directory
8020 * <0 on error
8021 */
8022
// Callback context for fetching exactly one directory entry.
struct single_readdir {
  struct dirent *de;      // destination dirent (required)
  struct ceph_statx *stx; // optional destination statx (may be NULL)
  Inode *inode;           // inode handed back by the callback
  bool full;              // set once the single entry has been filled
};
8029
8030static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8031 struct ceph_statx *stx, off_t off,
8032 Inode *in)
8033{
8034 single_readdir *c = static_cast<single_readdir *>(p);
8035
8036 if (c->full)
8037 return -1; // already filled this dirent
8038
8039 *c->de = *de;
8040 if (c->stx)
8041 *c->stx = *stx;
8042 c->inode = in;
8043 c->full = true;
8044 return 1;
8045}
8046
// POSIX-style readdir: returns a pointer to the next entry, or NULL at end
// of directory / on error (errno set on error).
// NOTE: uses a function-local `static struct dirent`, so like libc
// readdir(3) the returned buffer is shared across calls and not
// thread-safe.
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    errno = -ret; // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;
}
8069
8070int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8071 struct ceph_statx *stx, unsigned want,
8072 unsigned flags, Inode **out)
8073{
8074 single_readdir sr;
8075 sr.de = de;
8076 sr.stx = stx;
8077 sr.inode = NULL;
8078 sr.full = false;
8079
8080 // our callback fills the dirent and sets sr.full=true on first
8081 // call, and returns -1 the second time around.
8082 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8083 if (r < -1)
8084 return r;
8085 if (out)
8086 *out = sr.inode;
8087 if (sr.full)
8088 return 1;
8089 return 0;
8090}
8091
8092
8093/* getdents */
/* getdents */
// Callback context for packing entries into a caller-supplied flat buffer.
struct getdents_result {
  char *buf;    // destination buffer
  int buflen;   // total capacity of buf
  int pos;      // bytes written so far
  bool fullent; // true: pack whole dirents; false: pack names only
};
8100
8101static int _readdir_getdent_cb(void *p, struct dirent *de,
8102 struct ceph_statx *stx, off_t off, Inode *in)
8103{
8104 struct getdents_result *c = static_cast<getdents_result *>(p);
8105
8106 int dlen;
8107 if (c->fullent)
8108 dlen = sizeof(*de);
8109 else
8110 dlen = strlen(de->d_name) + 1;
8111
8112 if (c->pos + dlen > c->buflen)
8113 return -1; // doesn't fit
8114
8115 if (c->fullent) {
8116 memcpy(c->buf + c->pos, de, sizeof(*de));
8117 } else {
8118 memcpy(c->buf + c->pos, de->d_name, dlen);
8119 }
8120 c->pos += dlen;
8121 return 0;
8122}
8123
8124int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8125{
8126 getdents_result gr;
8127 gr.buf = buf;
8128 gr.buflen = buflen;
8129 gr.fullent = fullent;
8130 gr.pos = 0;
8131
8132 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8133
8134 if (r < 0) { // some error
8135 if (r == -1) { // buffer ran out of space
8136 if (gr.pos) { // but we got some entries already!
8137 return gr.pos;
8138 } // or we need a larger buffer
8139 return -ERANGE;
8140 } else { // actual error, return it
8141 return r;
8142 }
8143 }
8144 return gr.pos;
8145}
8146
8147
8148/* getdir */
/* getdir */
// Callback context collecting entry names into a caller-owned list.
struct getdir_result {
  list<string> *contents; // destination list of entry names
  int num;                // count of entries collected
};
8153
8154static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8155{
8156 getdir_result *r = static_cast<getdir_result *>(p);
8157
8158 r->contents->push_back(de->d_name);
8159 r->num++;
8160 return 0;
8161}
8162
8163int Client::getdir(const char *relpath, list<string>& contents,
8164 const UserPerm& perms)
8165{
8166 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8167 {
8168 Mutex::Locker lock(client_lock);
8169 tout(cct) << "getdir" << std::endl;
8170 tout(cct) << relpath << std::endl;
8171 }
8172
8173 dir_result_t *d;
8174 int r = opendir(relpath, &d, perms);
8175 if (r < 0)
8176 return r;
8177
8178 getdir_result gr;
8179 gr.contents = &contents;
8180 gr.num = 0;
8181 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8182
8183 closedir(d);
8184
8185 if (r < 0)
8186 return r;
8187 return gr.num;
8188}
8189
8190
8191/****** file i/o **********/
/****** file i/o **********/
// Open (and optionally create) the file at `relpath`.
// Optional striping parameters apply only when a new file is created.
// Returns a non-negative integer file descriptor or a negative errno.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  // Target missing and O_CREAT requested: create it in the parent dir.
  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }
  
 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8276
// Convenience overload: open with the filesystem's default file striping.
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8282
// Look up inode `ino` via its parent directory `dirino` and the hash of
// `name` (used by NFS re-export style handle resolution).  The request is
// sent to a random active MDS.  Returns 0 or a negative errno.
int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
			const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
  filepath path(ino);
  req->set_filepath(path);

  // Encode the rjenkins hash of the name as the dentry in path2.
  uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
  char f[30];
  sprintf(f, "%u", h);
  filepath path2(dirino);
  path2.push_dentry(string(f));
  req->set_filepath2(path2);

  int r = make_request(req, perms, NULL, NULL,
		       rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
  return r;
}
8308
8309
8310/**
8311 * Load inode into local cache.
8312 *
8313 * If inode pointer is non-NULL, and take a reference on
8314 * the resulting Inode object in one operation, so that caller
8315 * can safely assume inode will still be there after return.
8316 */
1adf2230 8317int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
7c673cae 8318{
1adf2230 8319 ldout(cct, 8) << "lookup_ino enter(" << ino << ")" << dendl;
7c673cae 8320
181888fb
FG
8321 if (unmounting)
8322 return -ENOTCONN;
8323
7c673cae
FG
8324 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8325 filepath path(ino);
8326 req->set_filepath(path);
8327
8328 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8329 if (r == 0 && inode != NULL) {
8330 vinodeno_t vino(ino, CEPH_NOSNAP);
8331 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8332 assert(p != inode_map.end());
8333 *inode = p->second;
8334 _ll_get(*inode);
8335 }
1adf2230 8336 ldout(cct, 8) << "lookup_ino exit(" << ino << ") = " << r << dendl;
7c673cae
FG
8337 return r;
8338}
8339
1adf2230
AA
// Public locked wrapper around _lookup_ino().
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  Mutex::Locker lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
7c673cae
FG
8345
8346/**
8347 * Find the parent inode of `ino` and insert it into
8348 * our cache. Conditionally also set `parent` to a referenced
8349 * Inode* if caller provides non-NULL value.
8350 */
1adf2230 8351int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
7c673cae 8352{
1adf2230 8353 ldout(cct, 8) << "lookup_parent enter(" << ino->ino << ")" << dendl;
7c673cae 8354
181888fb
FG
8355 if (unmounting)
8356 return -ENOTCONN;
8357
7c673cae
FG
8358 if (!ino->dn_set.empty()) {
8359 // if we exposed the parent here, we'd need to check permissions,
8360 // but right now we just rely on the MDS doing so in make_request
1adf2230 8361 ldout(cct, 8) << "lookup_parent dentry already present" << dendl;
7c673cae
FG
8362 return 0;
8363 }
8364
8365 if (ino->is_root()) {
8366 *parent = NULL;
1adf2230 8367 ldout(cct, 8) << "ino is root, no parent" << dendl;
7c673cae
FG
8368 return -EINVAL;
8369 }
8370
8371 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8372 filepath path(ino->ino);
8373 req->set_filepath(path);
8374
8375 InodeRef target;
8376 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8377 // Give caller a reference to the parent ino if they provided a pointer.
8378 if (parent != NULL) {
8379 if (r == 0) {
8380 *parent = target.get();
8381 _ll_get(*parent);
1adf2230 8382 ldout(cct, 8) << "lookup_parent found parent " << (*parent)->ino << dendl;
7c673cae
FG
8383 } else {
8384 *parent = NULL;
8385 }
8386 }
1adf2230 8387 ldout(cct, 8) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8388 return r;
8389}
8390
1adf2230
AA
// Public locked wrapper around _lookup_parent().
int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  Mutex::Locker lock(client_lock);
  return _lookup_parent(ino, perms, parent);
}
7c673cae
FG
8396
8397/**
8398 * Populate the parent dentry for `ino`, provided it is
8399 * a child of `parent`.
8400 */
1adf2230 8401int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
7c673cae
FG
8402{
8403 assert(parent->is_dir());
7c673cae
FG
8404 ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;
8405
181888fb
FG
8406 if (unmounting)
8407 return -ENOTCONN;
8408
7c673cae
FG
8409 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8410 req->set_filepath2(filepath(parent->ino));
8411 req->set_filepath(filepath(ino->ino));
8412 req->set_inode(ino);
8413
8414 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8415 ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
8416 return r;
8417}
8418
// Public, lock-taking wrapper around _lookup_name().
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
7c673cae
FG
8424
8425 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8426{
8427 assert(in);
8428 Fh *f = new Fh(in);
8429 f->mode = cmode;
8430 f->flags = flags;
8431
8432 // inode
8433 f->actor_perms = perms;
8434
8435 ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;
8436
8437 if (in->snapid != CEPH_NOSNAP) {
8438 in->snap_cap_refs++;
8439 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8440 << ccap_string(in->caps_issued()) << dendl;
8441 }
8442
8443 const md_config_t *conf = cct->_conf;
8444 f->readahead.set_trigger_requests(1);
8445 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8446 uint64_t max_readahead = Readahead::NO_LIMIT;
8447 if (conf->client_readahead_max_bytes) {
8448 max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8449 }
8450 if (conf->client_readahead_max_periods) {
8451 max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8452 }
8453 f->readahead.set_max_readahead_size(max_readahead);
8454 vector<uint64_t> alignments;
8455 alignments.push_back(in->layout.get_period());
8456 alignments.push_back(in->layout.stripe_unit);
8457 f->readahead.set_alignments(alignments);
8458
8459 return f;
8460}
8461
// Tear down an open-file handle: drop delegations, release open/cap
// references (flushing dirty data for non-snap inodes), release file
// locks, and surface any asynchronous write error recorded on the
// handle.  Returns that async error (0 if none).
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    // last open ref for this mode? flush dirty data and re-evaluate caps
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inode: drop the snap cap ref taken in _create_fh
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8496
8497void Client::_put_fh(Fh *f)
8498{
8499 int left = f->put();
8500 if (!left) {
8501 delete f;
8502 }
8503}
8504
// Open `in` with the given POSIX flags/mode, producing a new Fh via
// *fhp on success.  Rejects write-style opens of snapshot inodes with
// -EROFS.  If we already hold the caps implied by the open mode (and
// O_TRUNC is not requested) the open is satisfied locally; otherwise
// an MDS OPEN request is issued.  Caller must hold client_lock.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // local open: we already hold the needed caps
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    // O_CREAT is handled at the path-walk level, not here
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode);
      } else {
	// only needed the refs to wait; drop them again
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8583
// Re-acquire file caps for `in` after they may have gone stale.  If we
// still hold caps and either want no write caps or still have an auth
// cap, a cap check suffices; otherwise re-issue an MDS OPEN whose flags
// are synthesized from the currently-wanted caps.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // map wanted caps back onto open flags for the OPEN request
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8621
// POSIX-style close(): release the handle behind `fd` and recycle the
// descriptor.  Returns the async error surfaced by _release_fh (0 if
// none), -ENOTCONN while unmounting, or -EBADF for an unknown fd.
int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);   // return the descriptor number to the free pool
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
8641
8642
8643// ------------
8644// read, write
8645
// POSIX-style lseek(): validate the descriptor, then delegate to
// _lseek.  O_PATH handles (Linux) are not seekable here.
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _lseek(f, offset, whence);
}
8666
// Reposition the handle's file offset per SEEK_SET/CUR/END and return
// the new position.  SEEK_END must first refresh the size from the MDS
// (may return a negative error).  Unknown `whence` aborts.  Caller
// must hold client_lock.
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  int r;

  switch (whence) {
  case SEEK_SET:
    f->pos = offset;
    break;

  case SEEK_CUR:
    f->pos += offset;
    break;

  case SEEK_END:
    // re-stat so in->size is current before computing the end offset
    r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
    f->pos = in->size + offset;
    break;

  default:
    ceph_abort();
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
8695
8696
// Acquire exclusive use of f->pos.  Waiters queue FIFO on per-waiter
// condition variables; each waiter proceeds only when the lock is free
// AND it is at the head of the queue, preserving fairness.  Blocks on
// client_lock (which the Cond::Wait releases while sleeping).
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until the lock is free and we are first in line
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8714
// Release the f->pos lock taken by lock_fh_pos().
// NOTE(review): no waiter is signalled here; waiters appear to rely on
// being woken via client_lock churn — confirm against Cond semantics.
void Client::unlock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
  f->pos_locked = false;
}
8720
// Migrate inline file data out to the first RADOS object
// ("<ino>.00000000").  Two mutations are queued: an unconditional
// object create, then a guarded write (cmpxattr on "inline_version"
// ensures only a newer version wins) that also stamps the new inline
// version xattr.  `onfinish` fires when the second mutation completes.
// Completes immediately with 0 if there is no inline data.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // object name is "<hex ino>.00000000" — the file's first object
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // guard: only proceed if our inline_version is newer than the object's
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8765
8766//
8767
8768// blocking osd interface
8769
// POSIX-style read(): read up to `size` bytes at `offset` into `buf`.
// Delegates to _read via a bufferlist and copies the result out.
// Returns bytes read, or a negative error.
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // copy out and report the actual byte count delivered
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }
  return r;
}
8797
8798int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8799{
8800 if (iovcnt < 0)
8801 return -EINVAL;
8802 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8803}
8804
// Core read path.  A negative `offset` means "use and advance f->pos"
// (guarded by the fh pos lock).  Acquires CEPH_CAP_FILE_RD, serves
// inline data directly when possible (kicking off uninlining when the
// cache cap is missing), then routes through the object cacher
// (_read_async) or synchronous OSD reads (_read_sync).  A short sync
// read re-verifies the size and retries.  Returns bytes read into *bl
// or a negative error.  Caller must hold client_lock.
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  bool movepos = false;
  if (offset < 0) {
    // read at (and advance) the file position
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // fetch the inline data state before deciding how to read
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
        unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;   // O_DIRECT bypasses the cache

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // cannot serve inline data without the cache cap: push it to RADOS
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve directly from the inline buffer, zero-filling past its end
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // cached path
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    // sync path (no cache cap, O_DIRECT, or forced by config)
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read: drop caps, re-verify the size, and retry if the
      // file is actually longer than what we read up to
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait for the uninline mutation, dropping client_lock meanwhile
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      // -ECANCELED means another client uninlined first; either way the
      // inline copy is now stale
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
8948
// Pin the handle and count this readahead as pending for its lifetime.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
8954
// Undo the constructor: mark the readahead complete and unpin the handle.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
8959
// Readahead I/O completed: release the cap refs taken when it was issued.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8964
// Read through the object cacher.  Trims the request to the known file
// size, blocks (releasing client_lock) only when the data is not yet
// cached, then issues an optional asynchronous readahead whose cap
// refs are released by C_Readahead::finish.  Returns bytes read or a
// negative error.  Caller must hold client_lock and a FILE_RD cap ref.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    // miss: wait for the cacher to fill, dropping client_lock meanwhile
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	// released by C_Readahead::finish when the I/O completes
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9029
// Synchronous (cache-bypassing) read via the Filer.  Loops issuing OSD
// reads until `len` bytes are gathered; OSD ENOENT is treated as a
// hole (zero bytes).  On a short read inside the known file size it
// zero-fills up to EOF; a short read at/after the known size sets
// *checkeof so the caller can re-verify the size and retry.  Returns
// bytes appended to *bl or a negative error.  Caller holds client_lock
// (dropped while each OSD read is in flight).
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    // block on the OSD read without holding client_lock
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // hit (apparent) EOF: let the caller re-verify the size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9096
9097
9098/*
9099 * we keep count of uncommitted sync writes on the inode, so that
9100 * fsync can DDRT.
9101 */
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync can DDRT.
 */
// Completion bookkeeping for one synchronous write: decrement the
// global uncommitted-write counter, release the FILE_BUFFER cap ref
// taken when the write was issued, and wake an unmount waiting for the
// last unsafe write to drain.
void Client::_sync_write_commit(Inode *in)
{
  assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9115
// POSIX-style write(): write `size` bytes from `buf` at `offset`.
// Thin validation wrapper over _write.  Returns bytes written or a
// negative error.
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  int r = _write(fh, offset, size, buf, NULL, 0);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}
9138
9139int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9140{
9141 if (iovcnt < 0)
9142 return -EINVAL;
9143 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9144}
9145
// Shared implementation for preadv/pwritev.  Sums the iovec lengths,
// then either writes the vector directly via _write, or reads the full
// span into one bufferlist and scatters it across the iovecs (handling
// a short read that does not fill every iovec).  Returns bytes
// transferred or a negative error.
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // scatter the contiguous read back into the caller's iovecs
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9195
// Core write path.  Data comes either from `buf` or from `iov`/`iovcnt`
// (exactly one is used).  A negative `offset` means "use and advance
// f->pos" (honoring O_APPEND).  Enforces the MDS max file size, pool
// fullness and quota; acquires FILE_WR (+AUTH_SHARED to clear
// setuid/setgid); handles inline data (appending in place or forcing
// uninline); then writes either through the object cacher or as a
// synchronous Filer write counted in unsafe_sync_write.  On success
// updates size/mtime/ctime/change_attr and dirties FILE_WR caps.
// Returns bytes written or a negative error.  Caller holds client_lock.
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	           const struct iovec *iov, int iovcnt)
{
  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  uint64_t endoff = offset + size;
  std::list<InodeRef> quota_roots;
  if (endoff > in->size &&
      is_quota_bytes_exceeded(in, endoff - in->size, f->actor_perms, &quota_roots)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    f->pos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    // fetch inline data state before deciding how to write
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    // NOTE(review): this early return skips the final
    // put_cap_ref(in, CEPH_CAP_FILE_WR) at the bottom of this function,
    // apparently leaking the FILE_WR cap reference taken by get_caps()
    // above — confirm against later upstream fixes.
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;   // O_DIRECT bypasses buffering

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // write no longer fits inline (or cannot buffer): push data out
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // splice the new bytes into the inline buffer in place
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);   // keep the tail

      if (offset < len)
        in->inline_data.splice(offset, len - offset);     // drop overwritten span
      else if (offset > len)
        in->inline_data.append_zero(offset - len);        // pad a hole

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    // block on the OSD write without holding client_lock
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, quota_roots)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (onuninline) {
    // wait for the uninline mutation, dropping client_lock meanwhile
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      // -ECANCELED means another client uninlined first
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9434
// Surface (and clear) any asynchronous write error recorded on the
// handle; used by flush-style entry points.  Returns that error, or 0.
int Client::_flush(Fh *f)
{
  Inode *in = f->inode.get();
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  return err;
}
9448
9449int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9450{
9451 struct ceph_statx stx;
9452 stx.stx_size = length;
9453 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9454}
9455
// fd-based truncate: set the open file to `length` bytes via a
// size-only _setattr.  O_PATH handles (Linux) are rejected.
// NOTE(review): nothing here verifies the fd was opened for writing —
// POSIX ftruncate() requires EINVAL for a non-writable fd; confirm
// whether a CEPH_FILE_MODE_WR check is intentionally omitted.
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "ftruncate" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  // only st_size is consulted because the mask is CEPH_SETATTR_SIZE
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
9477
// POSIX-style fsync(): flush data (and metadata unless syncdataonly)
// for the open file.  On success, any error that accumulated
// asynchronously on the handle is reported instead; on failure the
// handle's async error state is cleared so the same error is not
// reported twice.
int Client::fsync(int fd, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we shoudl be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
                  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
                  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9513
// Core fsync implementation on an inode: flush buffered data through the
// object cacher (if enabled) and, unless syncdataonly, flush dirty caps and
// wait for any unsafe MDS requests to become safe.
// Called with client_lock held; temporarily drops it while waiting for the
// data flush to complete.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  // local lock/cond pair used only to rendezvous with the flush completion
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty metadata (caps) to the MDS synchronously
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // ask the MDS to flush its journal so unsafe ops become safe sooner
    flush_mdlog_sync();

    // waiting on the most recent unsafe op implies all earlier ones are safe
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    // drop client_lock while blocking on the flush callback to avoid deadlock
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9580
9581int Client::_fsync(Fh *f, bool syncdataonly)
9582{
1adf2230 9583 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
9584 return _fsync(f->inode.get(), syncdataonly);
9585}
9586
// stat(2)-style attribute fetch on an open descriptor.  Always issues a
// _getattr (mask controls which fields are refreshed) and then fills *stbuf.
int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  int r = _getattr(f->inode, mask, perms);
  if (r < 0)
    return r;
  fill_stat(f->inode, stbuf, NULL);
  ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
  return r;
}
9606
// statx-style attribute fetch on an open descriptor.  Unlike fstat(), this
// skips the MDS round-trip when the wanted fields are already covered by
// issued caps (unless 'flags' forces a sync via statx_to_mask).
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  // only go to the MDS if we lack caps covering the requested fields
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9636
9637// not written yet, but i want to link!
9638
// Change the client's working directory to 'relpath' and return the new
// absolute cwd path in 'new_cwd'.
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  // swap rather than assign so the old cwd ref is released via 'in'
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9661
b5b8bbf5 9662void Client::_getcwd(string& dir, const UserPerm& perms)
7c673cae
FG
9663{
9664 filepath path;
9665 ldout(cct, 10) << "getcwd " << *cwd << dendl;
9666
9667 Inode *in = cwd.get();
9668 while (in != root) {
9669 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
9670
9671 // A cwd or ancester is unlinked
9672 if (in->dn_set.empty()) {
9673 return;
9674 }
9675
9676 Dentry *dn = in->get_first_parent();
9677
9678
9679 if (!dn) {
9680 // look it up
9681 ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
9682 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9683 filepath path(in->ino);
9684 req->set_filepath(path);
9685 req->set_inode(in);
9686 int res = make_request(req, perms);
9687 if (res < 0)
9688 break;
9689
9690 // start over
9691 path = filepath();
9692 in = cwd.get();
9693 continue;
9694 }
9695 path.push_front_dentry(dn->name);
9696 in = dn->dir->parent_inode;
9697 }
9698 dir = "/";
9699 dir += path.get_path();
9700}
9701
b5b8bbf5
FG
9702void Client::getcwd(string& dir, const UserPerm& perms)
9703{
9704 Mutex::Locker l(client_lock);
181888fb
FG
9705 if (!unmounting)
9706 _getcwd(dir, perms);
b5b8bbf5
FG
9707}
9708
7c673cae
FG
// statvfs(2) implementation.  Reports cluster-wide RADOS statistics, except
// when a byte quota is set on the mount's quota root and client_quota_df is
// enabled, in which case the quota is reported as the filesystem size.
// The 'path' argument is ignored; statistics are not per-subtree.
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;
  unsigned long int total_files_on_fs;

  if (unmounting)
    return -ENOTCONN;

  ceph_statfs stats;
  C_SaferCond cond;

  // with a single data pool we can ask for that pool's stats specifically
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  // drop client_lock while waiting for the objecter round-trip
  client_lock.Unlock();
  int rval = cond.wait();
  assert(root);
  // NOTE(review): root->rstat is read here while client_lock is dropped;
  // looks racy against concurrent cap updates — confirm intended.
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_ffree = 0;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    // stats are in KB; shift converts to 4MB blocks
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9810
// Translate a POSIX struct flock into a CEPH_MDS_OP_{GET,SET}FILELOCK MDS
// request and, on success, mirror the result into the client-side lock
// state.  'lock_type' selects fcntl vs flock semantics, 'sleep' asks the
// MDS to wait for a conflicting lock, and 'removing' indicates the lock is
// being torn down with its Fh (so only inode state is updated).
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // map POSIX lock types onto the ceph wire protocol's lock commands
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // only a blocking SETFILELOCK of an actual lock can sleep on the MDS
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting (or absent) lock returned by the MDS
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // update the per-inode lock tracker (created lazily)
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
	// also track the lock on the Fh so it is released with the handle
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9921
// Cancel a blocked file-lock request.  Marks the original request aborted
// (so it will not be re-sent) and, if it already reached an MDS, sends a
// companion *_INTR unlock to revoke its effect.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // the interrupt uses a dedicated *_INTR rule matching the original lock
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  // copy the original lock arguments, then override rule/type to unlock
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
9954
// Serialize all held fcntl and flock locks on 'in' into 'bl'
// (count-prefixed lists; used when reconnecting to an MDS).
// Encodes nothing at all if the inode has never had any locks.
void Client::_encode_filelocks(Inode *in, bufferlist& bl)
{
  if (!in->fcntl_locks && !in->flock_locks)
    return;

  unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
  ::encode(nr_fcntl_locks, bl);
  if (nr_fcntl_locks) {
    ceph_lock_state_t* lock_state = in->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      ::encode(p->second, bl);
  }

  unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
  ::encode(nr_flock_locks, bl);
  if (nr_flock_locks) {
    ceph_lock_state_t* lock_state = in->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      ::encode(p->second, bl);
  }

  ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
		 << " fcntl locks, " << nr_flock_locks << " flock locks" <<  dendl;
}
9983
// Release every lock held through file handle 'fh' (called when the handle
// is closed).  Collects the held locks, frees the Fh-level trackers, then
// sends an F_UNLCK for each via _do_filelock with removing=true so only
// the inode-level state is updated.
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    ceph_lock_state_t* lock_state = fh->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
    delete fh->fcntl_locks;
  }
  if (fh->flock_locks) {
    ceph_lock_state_t* lock_state = fh->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
    delete fh->flock_locks;
  }

  if (to_release.empty())
    return;

  // one unlock request per previously-held lock range
  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
10029
10030void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10031 ceph_lock_state_t *lock_state)
10032{
10033 int lock_cmd;
10034 if (F_RDLCK == fl->l_type)
10035 lock_cmd = CEPH_LOCK_SHARED;
10036 else if (F_WRLCK == fl->l_type)
10037 lock_cmd = CEPH_LOCK_EXCL;
10038 else
10039 lock_cmd = CEPH_LOCK_UNLOCK;;
10040
10041 ceph_filelock filelock;
10042 filelock.start = fl->l_start;
10043 filelock.length = fl->l_len;
10044 filelock.client = 0;
10045 // see comment in _do_filelock()
10046 filelock.owner = owner | (1ULL << 63);
10047 filelock.pid = fl->l_pid;
10048 filelock.type = lock_cmd;
10049
10050 if (filelock.type == CEPH_LOCK_UNLOCK) {
10051 list<ceph_filelock> activated_locks;
10052 lock_state->remove_lock(filelock, activated_locks);
10053 } else {
10054 bool r = lock_state->add_lock(filelock, false, false, NULL);
10055 assert(r);
10056 }
10057}
10058
10059int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10060{
10061 Inode *in = fh->inode.get();
10062 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10063 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10064 return ret;
10065}
10066
10067int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10068{
10069 Inode *in = fh->inode.get();
10070 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10071 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10072 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10073 return ret;
10074}
10075
10076int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10077{
10078 Inode *in = fh->inode.get();
10079 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10080
10081 int sleep = !(cmd & LOCK_NB);
10082 cmd &= ~LOCK_NB;
10083
10084 int type;
10085 switch (cmd) {
10086 case LOCK_SH:
10087 type = F_RDLCK;
10088 break;
10089 case LOCK_EX:
10090 type = F_WRLCK;
10091 break;
10092 case LOCK_UN:
10093 type = F_UNLCK;
10094 break;
10095 default:
10096 return -EINVAL;
10097 }
10098
10099 struct flock fl;
10100 memset(&fl, 0, sizeof(fl));
10101 fl.l_type = type;
10102 fl.l_whence = SEEK_SET;
10103
10104 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10105 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10106 return ret;
10107}
10108
// Low-level statfs entry point; the inode argument is unused because
// statistics are cluster-wide.
int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
{
  /* Since the only thing this does is wrap a call to statfs, and
     statfs takes a lock, it doesn't seem we have a need to split it
     out. */
  return statfs(0, stbuf, perms);
}
10116
// Register FUSE/application callbacks (cache invalidation, lock interrupt,
// remount, umask).  Starts the corresponding finisher thread for each
// callback that is provided.  Safe to call with args == NULL (no-op).
void Client::ll_register_callbacks(struct client_callback_args *args)
{
  if (!args)
    return;
  Mutex::Locker l(client_lock);
  ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  umask_cb = args->umask_cb;
}
10147
// Verify we have some way to invalidate kernel dentry cache entries:
// either a registered dentry-invalidate callback or a remount callback.
// If neither works, abort or warn depending on configuration.
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    // test the remount path now rather than failing later
    r = _do_remount(false);
  }
  if (r) {
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort) {
      lderr(cct) << "no method to invalidate kernel dentry cache; quitting!" << dendl;
      ceph_abort();
    } else {
      lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    }
  }
  return r;
}
10173
// Flush the entire filesystem view: all buffered file data, all dirty caps,
// and wait for every unsafe MDS request to commit.  Called with client_lock
// held; drops it while waiting for the object cacher flush.
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;

  // flush caps
  flush_caps_sync();
  // snapshot the flush tid now so we wait only for flushes issued so far
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    // drop client_lock while blocking on the flush completion
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
10208
10209int Client::sync_fs()
10210{
10211 Mutex::Locker l(client_lock);
181888fb
FG
10212
10213 if (unmounting)
10214 return -ENOTCONN;
10215
7c673cae
FG
10216 return _sync_fs();
10217}
10218
// Drop cached object data, returning whatever release_all() reports
// (presumably the amount that could not be released — TODO confirm).
int64_t Client::drop_caches()
{
  Mutex::Locker l(client_lock);
  return objectcacher->release_all();
}
10224
10225
// Lazy-IO propagate: make this client's writes visible to others.
// Currently implemented as a plain data-only fsync; offset/count are
// accepted but ignored.  (Name typo "propogate" is part of the public API.)
int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
{
  Mutex::Locker l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  // for now
  _fsync(f, true);

  return 0;
}
10241
// Lazy-IO synchronize: flush our dirty data, then drop cached data/caps
// so subsequent reads observe other clients' writes.  offset/count are
// accepted but ignored.
int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  Mutex::Locker l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  _fsync(f, true);
  // if we released cached data, re-evaluate which caps we still need
  if (_release(in))
    check_caps(in, 0);
  return 0;
}
10258
10259
10260// =============================
10261// snaps
10262
// Create snapshot 'name' of the directory at 'relpath' by creating a
// directory entry inside its virtual .snap dir.
int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    // snapshot creation requires create permission on the directory
    r = may_create(in.get(), perm);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _mkdir(snapdir, name, 0, perm);
}
181888fb 10283
7c673cae
FG
// Remove snapshot 'name' of the directory at 'relpath' by removing the
// corresponding entry from its virtual .snap dir.
int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    // snapshot removal requires delete permission on the directory
    r = may_delete(in.get(), NULL, perms);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _rmdir(snapdir, name, perms);
}
10304
10305// =============================
10306// expose caps
10307
// Return the capability bits currently issued for the inode behind 'fd',
// or a negative errno (-ENOTCONN / -EBADF).
int Client::get_caps_issued(int fd) {

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return f->inode->caps_issued();
}
10321
// Return the capability bits currently issued for the inode at 'path'
// (symlinks followed), or a negative errno.
int Client::get_caps_issued(const char *path, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath p(path);
  InodeRef in;
  int r = path_walk(p, &in, perms, true);
  if (r < 0)
    return r;
  return in->caps_issued();
}
10336
10337// =========================================
10338// low level
10339
// Return (creating on first use) the virtual ".snap" directory inode for
// 'diri': same ino with snapid CEPH_SNAPDIR, mirroring the directory's
// attributes.  The snapdir is cached in inode_map.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // mirror the parent directory's identity and attributes
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10371
// Low-level lookup of 'name' under 'parent'.  On success fills *attr,
// takes an ll reference on the found inode, and returns it via *out.
// On failure attr->st_ino is zeroed.
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // enforce permissions ourselves unless FUSE is doing default_permissions
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // pin for the caller; released via ll_forget/ll_put

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10411
1adf2230
AA
10412int Client::ll_lookup_inode(
10413 struct inodeno_t ino,
10414 const UserPerm& perms,
10415 Inode **inode)
10416{
10417 Mutex::Locker lock(client_lock);
10418 ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
10419
10420 // Num1: get inode and *inode
10421 int r = _lookup_ino(ino, perms, inode);
10422 if (r) {
10423 return r;
10424 }
10425 assert(inode != NULL);
10426 assert(*inode != NULL);
10427
10428 // Num2: Request the parent inode, so that we can look up the name
10429 Inode *parent;
10430 r = _lookup_parent(*inode, perms, &parent);
10431 if (r && r != -EINVAL) {
10432 // Unexpected error
10433 _ll_forget(*inode, 1);
10434 return r;
10435 } else if (r == -EINVAL) {
10436 // EINVAL indicates node without parents (root), drop out now
10437 // and don't try to look up the non-existent dentry.
10438 return 0;
10439 }
10440 // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
10441 // is already in cache
10442 assert(parent != NULL);
10443
10444 // Num3: Finally, get the name (dentry) of the requested inode
10445 r = _lookup_name(*inode, parent, perms);
10446 if (r) {
10447 // Unexpected error
10448 _ll_forget(parent, 1);
10449 _ll_forget(*inode, 1);
10450 return r;
10451 }
10452
10453 _ll_forget(parent, 1);
10454 return 0;
10455}
10456
7c673cae
FG
// statx variant of ll_lookup: 'want'/'flags' control which attribute
// fields are fetched.  On success fills *stx and takes an ll reference
// on the inode returned via *out; on failure stx ino/mask are zeroed.
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10497
// Resolve a full path to an inode (optionally following the final symlink
// unless AT_SYMLINK_NOFOLLOW is set), fill *stx, and return the inode in
// *out with an ll reference held.  On error, *out is NULL and stx zeroed.
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << "ll_walk" << name << dendl;
  tout(cct) << "ll_walk" << std::endl;
  tout(cct) << name << std::endl;

  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller
    *out = in.get();
    return 0;
  }
}
10530
// Take a low-level (FUSE-visible) reference on an inode.  The first ll
// reference also takes an internal inode ref and pins the parent dentry
// of a directory so it cannot be trimmed while the kernel knows about it.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10543
// Drop 'num' low-level references from an inode.  When the count reaches
// zero, unpin the parent dentry (for dirs) and release the internal inode
// ref.  Returns the remaining ll reference count (0 if fully released).
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10559
10560void Client::_ll_drop_pins()
10561{
10562 ldout(cct, 10) << "_ll_drop_pins" << dendl;
1adf2230 10563 std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
7c673cae
FG
10564 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10565 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10566 it != inode_map.end();
10567 it = next) {
10568 Inode *in = it->second;
10569 next = it;
10570 ++next;
1adf2230
AA
10571 if (in->ll_ref){
10572 to_be_put.insert(in);
7c673cae 10573 _ll_put(in, in->ll_ref);
1adf2230 10574 }
7c673cae
FG
10575 }
10576}
10577
1adf2230 10578bool Client::_ll_forget(Inode *in, int count)
7c673cae 10579{
7c673cae
FG
10580 inodeno_t ino = _get_inodeno(in);
10581
1adf2230 10582 ldout(cct, 8) << "ll_forget " << ino << " " << count << dendl;
7c673cae
FG
10583 tout(cct) << "ll_forget" << std::endl;
10584 tout(cct) << ino.val << std::endl;
10585 tout(cct) << count << std::endl;
10586
181888fb
FG
10587 // Ignore forget if we're no longer mounted
10588 if (unmounting)
10589 return true;
10590
7c673cae
FG
10591 if (ino == 1) return true; // ignore forget on root.
10592
10593 bool last = false;
10594 if (in->ll_ref < count) {
10595 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
10596 << ", which only has ll_ref=" << in->ll_ref << dendl;
10597 _ll_put(in, in->ll_ref);
10598 last = true;
10599 } else {
10600 if (_ll_put(in, count) == 0)
10601 last = true;
10602 }
10603
10604 return last;
10605}
10606
1adf2230
AA
10607bool Client::ll_forget(Inode *in, int count)
10608{
10609 Mutex::Locker lock(client_lock);
10610 return _ll_forget(in, count);
10611}
10612
7c673cae
FG
10613bool Client::ll_put(Inode *in)
10614{
10615 /* ll_forget already takes the lock */
10616 return ll_forget(in, 1);
10617}
10618
10619snapid_t Client::ll_get_snapid(Inode *in)
10620{
10621 Mutex::Locker lock(client_lock);
10622 return in->snapid;
10623}
10624
10625Inode *Client::ll_get_inode(ino_t ino)
10626{
10627 Mutex::Locker lock(client_lock);
181888fb
FG
10628
10629 if (unmounting)
10630 return NULL;
10631
7c673cae
FG
10632 vinodeno_t vino = _map_faked_ino(ino);
10633 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10634 if (p == inode_map.end())
10635 return NULL;
10636 Inode *in = p->second;
10637 _ll_get(in);
10638 return in;
10639}
10640
10641Inode *Client::ll_get_inode(vinodeno_t vino)
10642{
10643 Mutex::Locker lock(client_lock);
181888fb
FG
10644
10645 if (unmounting)
10646 return NULL;
10647
7c673cae
FG
10648 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10649 if (p == inode_map.end())
10650 return NULL;
10651 Inode *in = p->second;
10652 _ll_get(in);
10653 return in;
10654}
10655
10656int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10657{
10658 vinodeno_t vino = _get_vino(in);
10659
1adf2230 10660 ldout(cct, 8) << "ll_getattr " << vino << dendl;
7c673cae
FG
10661 tout(cct) << "ll_getattr" << std::endl;
10662 tout(cct) << vino.ino.val << std::endl;
10663
10664 if (vino.snapid < CEPH_NOSNAP)
10665 return 0;
10666 else
10667 return _getattr(in, caps, perms);
10668}
10669
10670int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10671{
10672 Mutex::Locker lock(client_lock);
10673
181888fb
FG
10674 if (unmounting)
10675 return -ENOTCONN;
10676
7c673cae
FG
10677 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10678
10679 if (res == 0)
10680 fill_stat(in, attr);
10681 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10682 return res;
10683}
10684
10685int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10686 unsigned int flags, const UserPerm& perms)
10687{
10688 Mutex::Locker lock(client_lock);
10689
181888fb
FG
10690 if (unmounting)
10691 return -ENOTCONN;
10692
7c673cae
FG
10693 int res = 0;
10694 unsigned mask = statx_to_mask(flags, want);
10695
94b18763 10696 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
10697 res = _ll_getattr(in, mask, perms);
10698
10699 if (res == 0)
10700 fill_statx(in, mask, stx);
10701 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10702 return res;
10703}
10704
10705int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10706 const UserPerm& perms, InodeRef *inp)
10707{
10708 vinodeno_t vino = _get_vino(in);
10709
1adf2230 10710 ldout(cct, 8) << "ll_setattrx " << vino << " mask " << hex << mask << dec
7c673cae
FG
10711 << dendl;
10712 tout(cct) << "ll_setattrx" << std::endl;
10713 tout(cct) << vino.ino.val << std::endl;
10714 tout(cct) << stx->stx_mode << std::endl;
10715 tout(cct) << stx->stx_uid << std::endl;
10716 tout(cct) << stx->stx_gid << std::endl;
10717 tout(cct) << stx->stx_size << std::endl;
10718 tout(cct) << stx->stx_mtime << std::endl;
10719 tout(cct) << stx->stx_atime << std::endl;
10720 tout(cct) << stx->stx_btime << std::endl;
10721 tout(cct) << mask << std::endl;
10722
10723 if (!cct->_conf->fuse_default_permissions) {
10724 int res = may_setattr(in, stx, mask, perms);
10725 if (res < 0)
10726 return res;
10727 }
10728
10729 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
10730
10731 return __setattrx(in, stx, mask, perms, inp);
10732}
10733
10734int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10735 const UserPerm& perms)
10736{
10737 Mutex::Locker lock(client_lock);
181888fb
FG
10738
10739 if (unmounting)
10740 return -ENOTCONN;
10741
7c673cae
FG
10742 InodeRef target(in);
10743 int res = _ll_setattrx(in, stx, mask, perms, &target);
10744 if (res == 0) {
10745 assert(in == target.get());
10746 fill_statx(in, in->caps_issued(), stx);
10747 }
10748
10749 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10750 return res;
10751}
10752
10753int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10754 const UserPerm& perms)
10755{
10756 struct ceph_statx stx;
10757 stat_to_statx(attr, &stx);
10758
10759 Mutex::Locker lock(client_lock);
181888fb
FG
10760
10761 if (unmounting)
10762 return -ENOTCONN;
10763
7c673cae
FG
10764 InodeRef target(in);
10765 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10766 if (res == 0) {
10767 assert(in == target.get());
10768 fill_stat(in, attr);
10769 }
10770
10771 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10772 return res;
10773}
10774
10775
10776// ----------
10777// xattrs
10778
10779int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10780 const UserPerm& perms)
10781{
10782 Mutex::Locker lock(client_lock);
181888fb
FG
10783
10784 if (unmounting)
10785 return -ENOTCONN;
10786
7c673cae
FG
10787 InodeRef in;
10788 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10789 if (r < 0)
10790 return r;
10791 return _getxattr(in, name, value, size, perms);
10792}
10793
10794int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10795 const UserPerm& perms)
10796{
10797 Mutex::Locker lock(client_lock);
181888fb
FG
10798
10799 if (unmounting)
10800 return -ENOTCONN;
10801
7c673cae
FG
10802 InodeRef in;
10803 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10804 if (r < 0)
10805 return r;
10806 return _getxattr(in, name, value, size, perms);
10807}
10808
10809int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10810 const UserPerm& perms)
10811{
10812 Mutex::Locker lock(client_lock);
181888fb
FG
10813
10814 if (unmounting)
10815 return -ENOTCONN;
10816
7c673cae
FG
10817 Fh *f = get_filehandle(fd);
10818 if (!f)
10819 return -EBADF;
10820 return _getxattr(f->inode, name, value, size, perms);
10821}
10822
10823int Client::listxattr(const char *path, char *list, size_t size,
10824 const UserPerm& perms)
10825{
10826 Mutex::Locker lock(client_lock);
181888fb
FG
10827
10828 if (unmounting)
10829 return -ENOTCONN;
10830
7c673cae
FG
10831 InodeRef in;
10832 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10833 if (r < 0)
10834 return r;
10835 return Client::_listxattr(in.get(), list, size, perms);
10836}
10837
10838int Client::llistxattr(const char *path, char *list, size_t size,
10839 const UserPerm& perms)
10840{
10841 Mutex::Locker lock(client_lock);
181888fb
FG
10842
10843 if (unmounting)
10844 return -ENOTCONN;
10845
7c673cae
FG
10846 InodeRef in;
10847 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10848 if (r < 0)
10849 return r;
10850 return Client::_listxattr(in.get(), list, size, perms);
10851}
10852
10853int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10854{
10855 Mutex::Locker lock(client_lock);
181888fb
FG
10856
10857 if (unmounting)
10858 return -ENOTCONN;
10859
7c673cae
FG
10860 Fh *f = get_filehandle(fd);
10861 if (!f)
10862 return -EBADF;
10863 return Client::_listxattr(f->inode.get(), list, size, perms);
10864}
10865
10866int Client::removexattr(const char *path, const char *name,
10867 const UserPerm& perms)
10868{
10869 Mutex::Locker lock(client_lock);
181888fb
FG
10870
10871 if (unmounting)
10872 return -ENOTCONN;
10873
7c673cae
FG
10874 InodeRef in;
10875 int r = Client::path_walk(path, &in, perms, true);
10876 if (r < 0)
10877 return r;
10878 return _removexattr(in, name, perms);
10879}
10880
10881int Client::lremovexattr(const char *path, const char *name,
10882 const UserPerm& perms)
10883{
10884 Mutex::Locker lock(client_lock);
181888fb
FG
10885
10886 if (unmounting)
10887 return -ENOTCONN;
10888
7c673cae
FG
10889 InodeRef in;
10890 int r = Client::path_walk(path, &in, perms, false);
10891 if (r < 0)
10892 return r;
10893 return _removexattr(in, name, perms);
10894}
10895
10896int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10897{
10898 Mutex::Locker lock(client_lock);
181888fb
FG
10899
10900 if (unmounting)
10901 return -ENOTCONN;
10902
7c673cae
FG
10903 Fh *f = get_filehandle(fd);
10904 if (!f)
10905 return -EBADF;
10906 return _removexattr(f->inode, name, perms);
10907}
10908
10909int Client::setxattr(const char *path, const char *name, const void *value,
10910 size_t size, int flags, const UserPerm& perms)
10911{
10912 _setxattr_maybe_wait_for_osdmap(name, value, size);
10913
10914 Mutex::Locker lock(client_lock);
181888fb
FG
10915
10916 if (unmounting)
10917 return -ENOTCONN;
10918
7c673cae
FG
10919 InodeRef in;
10920 int r = Client::path_walk(path, &in, perms, true);
10921 if (r < 0)
10922 return r;
10923 return _setxattr(in, name, value, size, flags, perms);
10924}
10925
10926int Client::lsetxattr(const char *path, const char *name, const void *value,
10927 size_t size, int flags, const UserPerm& perms)
10928{
10929 _setxattr_maybe_wait_for_osdmap(name, value, size);
10930
10931 Mutex::Locker lock(client_lock);
181888fb
FG
10932
10933 if (unmounting)
10934 return -ENOTCONN;
10935
7c673cae
FG
10936 InodeRef in;
10937 int r = Client::path_walk(path, &in, perms, false);
10938 if (r < 0)
10939 return r;
10940 return _setxattr(in, name, value, size, flags, perms);
10941}
10942
10943int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10944 int flags, const UserPerm& perms)
10945{
10946 _setxattr_maybe_wait_for_osdmap(name, value, size);
10947
10948 Mutex::Locker lock(client_lock);
181888fb
FG
10949
10950 if (unmounting)
10951 return -ENOTCONN;
10952
7c673cae
FG
10953 Fh *f = get_filehandle(fd);
10954 if (!f)
10955 return -EBADF;
10956 return _setxattr(f->inode, name, value, size, flags, perms);
10957}
10958
10959int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10960 const UserPerm& perms)
10961{
10962 int r;
10963
10964 const VXattr *vxattr = _match_vxattr(in, name);
10965 if (vxattr) {
10966 r = -ENODATA;
10967
10968 // Do a force getattr to get the latest quota before returning
10969 // a value to userspace.
28e407b8
AA
10970 int flags = 0;
10971 if (vxattr->flags & VXATTR_RSTAT) {
10972 flags |= CEPH_STAT_RSTAT;
10973 }
10974 r = _getattr(in, flags, perms, true);
7c673cae
FG
10975 if (r != 0) {
10976 // Error from getattr!
10977 return r;
10978 }
10979
10980 // call pointer-to-member function
10981 char buf[256];
10982 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10983 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10984 } else {
10985 r = -ENODATA;
10986 }
10987
10988 if (size != 0) {
10989 if (r > (int)size) {
10990 r = -ERANGE;
10991 } else if (r > 0) {
10992 memcpy(value, buf, r);
10993 }
10994 }
10995 goto out;
10996 }
10997
10998 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10999 r = -EOPNOTSUPP;
11000 goto out;
11001 }
11002
11003 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11004 if (r == 0) {
11005 string n(name);
11006 r = -ENODATA;
11007 if (in->xattrs.count(n)) {
11008 r = in->xattrs[n].length();
11009 if (r > 0 && size != 0) {
11010 if (size >= (unsigned)r)
11011 memcpy(value, in->xattrs[n].c_str(), r);
11012 else
11013 r = -ERANGE;
11014 }
11015 }
11016 }
11017 out:
1adf2230 11018 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
11019 return r;
11020}
11021
11022int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11023 const UserPerm& perms)
11024{
11025 if (cct->_conf->client_permissions) {
11026 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11027 if (r < 0)
11028 return r;
11029 }
11030 return _getxattr(in.get(), name, value, size, perms);
11031}
11032
11033int Client::ll_getxattr(Inode *in, const char *name, void *value,
11034 size_t size, const UserPerm& perms)
11035{
11036 Mutex::Locker lock(client_lock);
11037
181888fb
FG
11038 if (unmounting)
11039 return -ENOTCONN;
11040
7c673cae
FG
11041 vinodeno_t vino = _get_vino(in);
11042
11043 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
11044 tout(cct) << "ll_getxattr" << std::endl;
11045 tout(cct) << vino.ino.val << std::endl;
11046 tout(cct) << name << std::endl;
11047
11048 if (!cct->_conf->fuse_default_permissions) {
11049 int r = xattr_permission(in, name, MAY_READ, perms);
11050 if (r < 0)
11051 return r;
11052 }
11053
11054 return _getxattr(in, name, value, size, perms);
11055}
11056
11057int Client::_listxattr(Inode *in, char *name, size_t size,
11058 const UserPerm& perms)
11059{
11060 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11061 if (r == 0) {
11062 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
11063 p != in->xattrs.end();
11064 ++p)
11065 r += p->first.length() + 1;
11066
11067 const VXattr *vxattrs = _get_vxattrs(in);
11068 r += _vxattrs_name_size(vxattrs);
11069
11070 if (size != 0) {
11071 if (size >= (unsigned)r) {
11072 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
11073 p != in->xattrs.end();
11074 ++p) {
11075 memcpy(name, p->first.c_str(), p->first.length());
11076 name += p->first.length();
11077 *name = '\0';
11078 name++;
11079 }
11080 if (vxattrs) {
11081 for (int i = 0; !vxattrs[i].name.empty(); i++) {
11082 const VXattr& vxattr = vxattrs[i];
11083 if (vxattr.hidden)
11084 continue;
11085 // call pointer-to-member function
11086 if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
11087 continue;
11088 memcpy(name, vxattr.name.c_str(), vxattr.name.length());
11089 name += vxattr.name.length();
11090 *name = '\0';
11091 name++;
11092 }
11093 }
11094 } else
11095 r = -ERANGE;
11096 }
11097 }
1adf2230 11098 ldout(cct, 8) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
7c673cae
FG
11099 return r;
11100}
11101
11102int Client::ll_listxattr(Inode *in, char *names, size_t size,
11103 const UserPerm& perms)
11104{
11105 Mutex::Locker lock(client_lock);
11106
181888fb
FG
11107 if (unmounting)
11108 return -ENOTCONN;
11109
7c673cae
FG
11110 vinodeno_t vino = _get_vino(in);
11111
11112 ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
11113 tout(cct) << "ll_listxattr" << std::endl;
11114 tout(cct) << vino.ino.val << std::endl;
11115 tout(cct) << size << std::endl;
11116
11117 return _listxattr(in, names, size, perms);
11118}
11119
11120int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11121 size_t size, int flags, const UserPerm& perms)
11122{
11123
11124 int xattr_flags = 0;
11125 if (!value)
11126 xattr_flags |= CEPH_XATTR_REMOVE;
11127 if (flags & XATTR_CREATE)
11128 xattr_flags |= CEPH_XATTR_CREATE;
11129 if (flags & XATTR_REPLACE)
11130 xattr_flags |= CEPH_XATTR_REPLACE;
11131
11132 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11133 filepath path;
11134 in->make_nosnap_relative_path(path);
11135 req->set_filepath(path);
11136 req->set_string2(name);
11137 req->set_inode(in);
11138 req->head.args.setxattr.flags = xattr_flags;
11139
11140 bufferlist bl;
11141 bl.append((const char*)value, size);
11142 req->set_data(bl);
11143
11144 int res = make_request(req, perms);
11145
11146 trim_cache();
11147 ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
11148 res << dendl;
11149 return res;
11150}
11151
11152int Client::_setxattr(Inode *in, const char *name, const void *value,
11153 size_t size, int flags, const UserPerm& perms)
11154{
11155 if (in->snapid != CEPH_NOSNAP) {
11156 return -EROFS;
11157 }
11158
11159 bool posix_acl_xattr = false;
11160 if (acl_type == POSIX_ACL)
11161 posix_acl_xattr = !strncmp(name, "system.", 7);
11162
11163 if (strncmp(name, "user.", 5) &&
11164 strncmp(name, "security.", 9) &&
11165 strncmp(name, "trusted.", 8) &&
11166 strncmp(name, "ceph.", 5) &&
11167 !posix_acl_xattr)
11168 return -EOPNOTSUPP;
11169
11170 if (posix_acl_xattr) {
11171 if (!strcmp(name, ACL_EA_ACCESS)) {
11172 mode_t new_mode = in->mode;
11173 if (value) {
11174 int ret = posix_acl_equiv_mode(value, size, &new_mode);
11175 if (ret < 0)
11176 return ret;
11177 if (ret == 0) {
11178 value = NULL;
11179 size = 0;
11180 }
11181 if (new_mode != in->mode) {
11182 struct ceph_statx stx;
11183 stx.stx_mode = new_mode;
11184 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
11185 if (ret < 0)
11186 return ret;
11187 }
11188 }
11189 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
11190 if (value) {
11191 if (!S_ISDIR(in->mode))
11192 return -EACCES;
11193 int ret = posix_acl_check(value, size);
11194 if (ret < 0)
11195 return -EINVAL;
11196 if (ret == 0) {
11197 value = NULL;
11198 size = 0;
11199 }
11200 }
11201 } else {
11202 return -EOPNOTSUPP;
11203 }
11204 } else {
11205 const VXattr *vxattr = _match_vxattr(in, name);
11206 if (vxattr && vxattr->readonly)
11207 return -EOPNOTSUPP;
11208 }
11209
11210 return _do_setxattr(in, name, value, size, flags, perms);
11211}
11212
11213int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11214 size_t size, int flags, const UserPerm& perms)
11215{
11216 if (cct->_conf->client_permissions) {
11217 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11218 if (r < 0)
11219 return r;
11220 }
11221 return _setxattr(in.get(), name, value, size, flags, perms);
11222}
11223
11224int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11225{
11226 string tmp;
11227 if (name == "layout") {
11228 string::iterator begin = value.begin();
11229 string::iterator end = value.end();
11230 keys_and_values<string::iterator> p; // create instance of parser
11231 std::map<string, string> m; // map to receive results
11232 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11233 return -EINVAL;
11234 }
11235 if (begin != end)
11236 return -EINVAL;
11237 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11238 if (q->first == "pool") {
11239 tmp = q->second;
11240 break;
11241 }
11242 }
11243 } else if (name == "layout.pool") {
11244 tmp = value;
11245 }
11246
11247 if (tmp.length()) {
11248 int64_t pool;
11249 try {
11250 pool = boost::lexical_cast<unsigned>(tmp);
11251 if (!osdmap->have_pg_pool(pool))
11252 return -ENOENT;
11253 } catch (boost::bad_lexical_cast const&) {
11254 pool = osdmap->lookup_pg_pool_name(tmp);
11255 if (pool < 0) {
11256 return -ENOENT;
11257 }
11258 }
11259 }
11260
11261 return 0;
11262}
11263
11264void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11265{
11266 // For setting pool of layout, MetaRequest need osdmap epoch.
11267 // There is a race which create a new data pool but client and mds both don't have.
11268 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11269 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11270 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11271 string rest(strstr(name, "layout"));
11272 string v((const char*)value, size);
11273 int r = objecter->with_osdmap([&](const OSDMap& o) {
11274 return _setxattr_check_data_pool(rest, v, &o);
11275 });
11276
11277 if (r == -ENOENT) {
11278 C_SaferCond ctx;
11279 objecter->wait_for_latest_osdmap(&ctx);
11280 ctx.wait();
11281 }
11282 }
11283}
11284
11285int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11286 size_t size, int flags, const UserPerm& perms)
11287{
11288 _setxattr_maybe_wait_for_osdmap(name, value, size);
11289
11290 Mutex::Locker lock(client_lock);
11291
181888fb
FG
11292 if (unmounting)
11293 return -ENOTCONN;
11294
7c673cae
FG
11295 vinodeno_t vino = _get_vino(in);
11296
11297 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
11298 tout(cct) << "ll_setxattr" << std::endl;
11299 tout(cct) << vino.ino.val << std::endl;
11300 tout(cct) << name << std::endl;
11301
11302 if (!cct->_conf->fuse_default_permissions) {
11303 int r = xattr_permission(in, name, MAY_WRITE, perms);
11304 if (r < 0)
11305 return r;
11306 }
11307 return _setxattr(in, name, value, size, flags, perms);
11308}
11309
11310int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11311{
11312 if (in->snapid != CEPH_NOSNAP) {
11313 return -EROFS;
11314 }
11315
11316 // same xattrs supported by kernel client
11317 if (strncmp(name, "user.", 5) &&
11318 strncmp(name, "system.", 7) &&
11319 strncmp(name, "security.", 9) &&
11320 strncmp(name, "trusted.", 8) &&
11321 strncmp(name, "ceph.", 5))
11322 return -EOPNOTSUPP;
11323
11324 const VXattr *vxattr = _match_vxattr(in, name);
11325 if (vxattr && vxattr->readonly)
11326 return -EOPNOTSUPP;
11327
11328 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11329 filepath path;
11330 in->make_nosnap_relative_path(path);
11331 req->set_filepath(path);
11332 req->set_filepath2(name);
11333 req->set_inode(in);
11334
11335 int res = make_request(req, perms);
11336
11337 trim_cache();
1adf2230 11338 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
7c673cae
FG
11339 return res;
11340}
11341
11342int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11343{
11344 if (cct->_conf->client_permissions) {
11345 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11346 if (r < 0)
11347 return r;
11348 }
11349 return _removexattr(in.get(), name, perms);
11350}
11351
11352int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11353{
11354 Mutex::Locker lock(client_lock);
11355
181888fb
FG
11356 if (unmounting)
11357 return -ENOTCONN;
11358
7c673cae
FG
11359 vinodeno_t vino = _get_vino(in);
11360
11361 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11362 tout(cct) << "ll_removexattr" << std::endl;
11363 tout(cct) << vino.ino.val << std::endl;
11364 tout(cct) << name << std::endl;
11365
11366 if (!cct->_conf->fuse_default_permissions) {
11367 int r = xattr_permission(in, name, MAY_WRITE, perms);
11368 if (r < 0)
11369 return r;
11370 }
11371
11372 return _removexattr(in, name, perms);
11373}
11374
11375bool Client::_vxattrcb_quota_exists(Inode *in)
11376{
11377 return in->quota.is_enable();
11378}
11379size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11380{
11381 return snprintf(val, size,
11382 "max_bytes=%lld max_files=%lld",
11383 (long long int)in->quota.max_bytes,
11384 (long long int)in->quota.max_files);
11385}
11386size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11387{
11388 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11389}
11390size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11391{
11392 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11393}
11394
11395bool Client::_vxattrcb_layout_exists(Inode *in)
11396{
11397 return in->layout != file_layout_t();
11398}
11399size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11400{
11401 int r = snprintf(val, size,
11402 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
11403 (unsigned long long)in->layout.stripe_unit,
11404 (unsigned long long)in->layout.stripe_count,
11405 (unsigned long long)in->layout.object_size);
11406 objecter->with_osdmap([&](const OSDMap& o) {
11407 if (o.have_pg_pool(in->layout.pool_id))
11408 r += snprintf(val + r, size - r, "%s",
11409 o.get_pool_name(in->layout.pool_id).c_str());
11410 else
11411 r += snprintf(val + r, size - r, "%" PRIu64,
11412 (uint64_t)in->layout.pool_id);
11413 });
11414 if (in->layout.pool_ns.length())
11415 r += snprintf(val + r, size - r, " pool_namespace=%s",
11416 in->layout.pool_ns.c_str());
11417 return r;
11418}
11419size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11420{
11421 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
11422}
11423size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11424{
11425 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
11426}
11427size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11428{
11429 return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
11430}
11431size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11432{
11433 size_t r;
11434 objecter->with_osdmap([&](const OSDMap& o) {
11435 if (o.have_pg_pool(in->layout.pool_id))
11436 r = snprintf(val, size, "%s", o.get_pool_name(
11437 in->layout.pool_id).c_str());
11438 else
11439 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11440 });
11441 return r;
11442}
11443size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11444{
11445 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11446}
11447size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11448{
11449 return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11450}
11451size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11452{
11453 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
11454}
11455size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11456{
11457 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
11458}
11459size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11460{
11461 return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11462}
11463size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11464{
11465 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
11466}
11467size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11468{
11469 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
11470}
11471size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11472{
11473 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
11474}
11475size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11476{
11477 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11478 (long)in->rstat.rctime.nsec());
11479}
11480
11481#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11482#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11483
11484#define XATTR_NAME_CEPH(_type, _name) \
11485{ \
11486 name: CEPH_XATTR_NAME(_type, _name), \
11487 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11488 readonly: true, \
11489 hidden: false, \
11490 exists_cb: NULL, \
28e407b8
AA
11491 flags: 0, \
11492}
11493#define XATTR_NAME_CEPH2(_type, _name, _flags) \
11494{ \
11495 name: CEPH_XATTR_NAME(_type, _name), \
11496 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11497 readonly: true, \
11498 hidden: false, \
11499 exists_cb: NULL, \
11500 flags: _flags, \
7c673cae
FG
11501}
11502#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11503{ \
11504 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11505 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11506 readonly: false, \
11507 hidden: true, \
11508 exists_cb: &Client::_vxattrcb_layout_exists, \
28e407b8 11509 flags: 0, \
7c673cae
FG
11510}
11511#define XATTR_QUOTA_FIELD(_type, _name) \
11512{ \
11513 name: CEPH_XATTR_NAME(_type, _name), \
11514 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11515 readonly: false, \
11516 hidden: true, \
11517 exists_cb: &Client::_vxattrcb_quota_exists, \
28e407b8 11518 flags: 0, \
7c673cae
FG
11519}
11520
11521const Client::VXattr Client::_dir_vxattrs[] = {
11522 {
11523 name: "ceph.dir.layout",
11524 getxattr_cb: &Client::_vxattrcb_layout,
11525 readonly: false,
11526 hidden: true,
11527 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 11528 flags: 0,
7c673cae
FG
11529 },
11530 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11531 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11532 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11533 XATTR_LAYOUT_FIELD(dir, layout, pool),
11534 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11535 XATTR_NAME_CEPH(dir, entries),
11536 XATTR_NAME_CEPH(dir, files),
11537 XATTR_NAME_CEPH(dir, subdirs),
28e407b8
AA
11538 XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
11539 XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
11540 XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
11541 XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
11542 XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
7c673cae
FG
11543 {
11544 name: "ceph.quota",
11545 getxattr_cb: &Client::_vxattrcb_quota,
11546 readonly: false,
11547 hidden: true,
11548 exists_cb: &Client::_vxattrcb_quota_exists,
28e407b8 11549 flags: 0,
7c673cae
FG
11550 },
11551 XATTR_QUOTA_FIELD(quota, max_bytes),
11552 XATTR_QUOTA_FIELD(quota, max_files),
11553 { name: "" } /* Required table terminator */
11554};
11555
11556const Client::VXattr Client::_file_vxattrs[] = {
11557 {
11558 name: "ceph.file.layout",
11559 getxattr_cb: &Client::_vxattrcb_layout,
11560 readonly: false,
11561 hidden: true,
11562 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 11563 flags: 0,
7c673cae
FG
11564 },
11565 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
11566 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
11567 XATTR_LAYOUT_FIELD(file, layout, object_size),
11568 XATTR_LAYOUT_FIELD(file, layout, pool),
11569 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
11570 { name: "" } /* Required table terminator */
11571};
11572
11573const Client::VXattr *Client::_get_vxattrs(Inode *in)
11574{
11575 if (in->is_dir())
11576 return _dir_vxattrs;
11577 else if (in->is_file())
11578 return _file_vxattrs;
11579 return NULL;
11580}
11581
11582const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11583{
11584 if (strncmp(name, "ceph.", 5) == 0) {
11585 const VXattr *vxattr = _get_vxattrs(in);
11586 if (vxattr) {
11587 while (!vxattr->name.empty()) {
11588 if (vxattr->name == name)
11589 return vxattr;
11590 vxattr++;
11591 }
11592 }
11593 }
11594 return NULL;
11595}
11596
11597size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11598{
11599 size_t len = 0;
11600 while (!vxattr->name.empty()) {
11601 if (!vxattr->hidden)
11602 len += vxattr->name.length() + 1;
11603 vxattr++;
11604 }
11605 return len;
11606}
11607
// Low-level (libcephfs/FUSE) readlink entry point: copy the symlink
// target of @in into @buf (up to @buflen bytes).  Returns the number
// of bytes copied or a negative errno.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // Refresh the LRU position of every dentry linking to this inode
  // so recently used symlinks stay in the cache.
  set<Dentry*>::iterator dn = in->dn_set.begin();
  while (dn != in->dn_set.end()) {
    touch_dn(*dn);
    ++dn;
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
11631
11632int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
11633 const UserPerm& perms, InodeRef *inp)
11634{
1adf2230 11635 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
11636 << mode << dec << ", " << rdev << ", uid " << perms.uid()
11637 << ", gid " << perms.gid() << ")" << dendl;
11638
11639 if (strlen(name) > NAME_MAX)
11640 return -ENAMETOOLONG;
11641
11642 if (dir->snapid != CEPH_NOSNAP) {
11643 return -EROFS;
11644 }
11645 if (is_quota_files_exceeded(dir, perms)) {
11646 return -EDQUOT;
11647 }
11648
11649 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
11650
11651 filepath path;
11652 dir->make_nosnap_relative_path(path);
11653 path.push_dentry(name);
11654 req->set_filepath(path);
11655 req->set_inode(dir);
11656 req->head.args.mknod.rdev = rdev;
11657 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11658 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11659
11660 bufferlist xattrs_bl;
11661 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11662 if (res < 0)
11663 goto fail;
11664 req->head.args.mknod.mode = mode;
11665 if (xattrs_bl.length() > 0)
11666 req->set_data(xattrs_bl);
11667
11668 Dentry *de;
11669 res = get_or_create(dir, name, &de);
11670 if (res < 0)
11671 goto fail;
11672 req->set_dentry(de);
11673
11674 res = make_request(req, perms, inp);
11675
11676 trim_cache();
11677
1adf2230 11678 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
11679 return res;
11680
11681 fail:
11682 put_request(req);
11683 return res;
11684}
11685
// Low-level mknod: create a special file and return its stat data in
// @attr and a referenced inode in @out (caller owns the ll ref).
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());    // take an ll reference handed to the caller via *out
  }
  // NOTE(review): on failure @attr is left untouched, so the traced
  // st_ino below may be uninitialized caller memory — confirm callers
  // zero the struct.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	  << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11722
11723int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
11724 dev_t rdev, Inode **out,
11725 struct ceph_statx *stx, unsigned want, unsigned flags,
11726 const UserPerm& perms)
11727{
11728 unsigned caps = statx_to_mask(flags, want);
11729 Mutex::Locker lock(client_lock);
11730
181888fb
FG
11731 if (unmounting)
11732 return -ENOTCONN;
11733
7c673cae
FG
11734 vinodeno_t vparent = _get_vino(parent);
11735
11736 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
11737 tout(cct) << "ll_mknodx" << std::endl;
11738 tout(cct) << vparent.ino.val << std::endl;
11739 tout(cct) << name << std::endl;
11740 tout(cct) << mode << std::endl;
11741 tout(cct) << rdev << std::endl;
11742
11743 if (!cct->_conf->fuse_default_permissions) {
11744 int r = may_create(parent, perms);
11745 if (r < 0)
11746 return r;
11747 }
11748
11749 InodeRef in;
11750 int r = _mknod(parent, name, mode, rdev, perms, &in);
11751 if (r == 0) {
11752 fill_statx(in, caps, stx);
11753 _ll_get(in.get());
11754 }
11755 tout(cct) << stx->stx_ino << std::endl;
11756 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
11757 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11758 *out = in.get();
11759 return r;
11760}
11761
// Create (and optionally open) a regular file @name in @dir via
// CEPH_MDS_OP_CREATE.  Optional layout parameters (stripe_unit,
// stripe_count, object_size, data_pool) are passed through to the
// MDS.  On success *inp references the inode; if @fhp is non-NULL the
// file is also opened and *fhp receives the handle.  *created (may be
// set by make_request) reports whether the file was newly created.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  // Resolve the requested data pool name to an id, if one was given.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    // The wire field for the pool id is only 32 bits wide.
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // Inherit default ACLs from the parent; this may modify @mode and
  // produce initial xattrs to ship with the request.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // Request never reached make_request(); drop our reference.
  put_request(req);
  return res;
}
11854
11855
// Create directory @name under @dir.  If @dir is the special snapdir,
// this creates a snapshot (CEPH_MDS_OP_MKSNAP) instead of a regular
// directory (CEPH_MDS_OP_MKDIR).  On success *inp references the new
// inode.  Returns 0 or a negative errno.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Writes are allowed in the snapdir itself (mksnap), but not inside
  // any other snapshot.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // Inherit default ACLs from the parent; this may modify @mode and
  // produce initial xattrs to ship with the request.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // Request never reached make_request(); drop our reference.
  put_request(req);
  return res;
}
11911
// Low-level mkdir: create a directory and return its stat data in
// @attr and a referenced inode in @out (caller owns the ll ref).
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());    // take an ll reference handed to the caller via *out
  }
  // NOTE(review): on failure @attr is left untouched, so the traced
  // st_ino below may be uninitialized caller memory — confirm callers
  // zero the struct.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11946
// Low-level mkdir with statx semantics: create a directory and fill
// @stx per @want/@flags; on success *out holds a referenced inode
// (caller owns the ll ref).  On failure @stx is cleared.
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());    // take an ll reference handed to the caller via *out
  } else {
    // Don't let the caller (or the trace below) see stale statx data.
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11985
// Create a symlink @name -> @target under @dir via
// CEPH_MDS_OP_SYMLINK.  On success *inp references the new inode.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);   // the link target travels as string2
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // Request never reached make_request(); drop our reference.
  put_request(req);
  return res;
}
12031
// Low-level symlink: create @name -> @value and return stat data in
// @attr and a referenced inode in @out (caller owns the ll ref).
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());    // take an ll reference handed to the caller via *out
  }
  // NOTE(review): on failure @attr is left untouched; the traced
  // st_ino below may be uninitialized caller memory.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12067
// Low-level symlink with statx semantics: create @name -> @value and
// fill @stx per @want/@flags; on success *out holds a referenced
// inode (caller owns the ll ref).
int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
			Inode **out, struct ceph_statx *stx, unsigned want,
			unsigned flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
		<< dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());    // take an ll reference handed to the caller via *out
  }
  // NOTE(review): unlike ll_mkdirx, @stx is not cleared on failure —
  // the traced stx_ino below may be uninitialized caller memory.
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
12104
12105int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12106{
1adf2230 12107 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
7c673cae
FG
12108 << " uid " << perm.uid() << " gid " << perm.gid()
12109 << ")" << dendl;
12110
12111 if (dir->snapid != CEPH_NOSNAP) {
12112 return -EROFS;
12113 }
12114
12115 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12116
12117 filepath path;
12118 dir->make_nosnap_relative_path(path);
12119 path.push_dentry(name);
12120 req->set_filepath(path);
12121
12122 InodeRef otherin;
b32b8144 12123 Inode *in;
7c673cae 12124 Dentry *de;
b32b8144 12125
7c673cae
FG
12126 int res = get_or_create(dir, name, &de);
12127 if (res < 0)
12128 goto fail;
12129 req->set_dentry(de);
12130 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12131 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12132
12133 res = _lookup(dir, name, 0, &otherin, perm);
12134 if (res < 0)
12135 goto fail;
b32b8144
FG
12136
12137 in = otherin.get();
12138 req->set_other_inode(in);
12139 in->break_all_delegs();
7c673cae
FG
12140 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12141
12142 req->set_inode(dir);
12143
12144 res = make_request(req, perm);
12145
12146 trim_cache();
1adf2230 12147 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
7c673cae
FG
12148 return res;
12149
12150 fail:
12151 put_request(req);
12152 return res;
12153}
12154
// Low-level unlink: remove entry @name from directory inode @in.
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
12176
12177int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12178{
1adf2230 12179 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
12180 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12181
12182 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12183 return -EROFS;
12184 }
b32b8144
FG
12185
12186 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12187 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
12188 filepath path;
12189 dir->make_nosnap_relative_path(path);
12190 path.push_dentry(name);
12191 req->set_filepath(path);
12192
12193 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12194 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12195 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12196
12197 InodeRef in;
12198
12199 Dentry *de;
12200 int res = get_or_create(dir, name, &de);
12201 if (res < 0)
12202 goto fail;
b32b8144
FG
12203 if (op == CEPH_MDS_OP_RMDIR)
12204 req->set_dentry(de);
12205 else
12206 de->get();
12207
7c673cae
FG
12208 res = _lookup(dir, name, 0, &in, perms);
12209 if (res < 0)
12210 goto fail;
b32b8144 12211 if (op == CEPH_MDS_OP_RMDIR) {
7c673cae 12212 req->set_inode(dir);
7c673cae
FG
12213 req->set_other_inode(in.get());
12214 } else {
12215 unlink(de, true, true);
b32b8144 12216 de->put();
7c673cae
FG
12217 req->set_other_inode(in.get());
12218 }
12219
12220 res = make_request(req, perms);
12221
12222 trim_cache();
1adf2230 12223 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae
FG
12224 return res;
12225
12226 fail:
12227 put_request(req);
12228 return res;
12229}
12230
// Low-level rmdir: remove subdirectory @name of directory inode @in.
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }

  return _rmdir(in, name, perms);
}
12253
// Rename @fromdir/@fromname to @todir/@toname.  Renaming a snapshot
// within the same snapdir maps to CEPH_MDS_OP_RENAMESNAP; any other
// cross-snapshot rename is rejected.  Renames across different quota
// roots are rejected with -EXDEV so quota accounting stays local.
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  // Disallow renames that would cross a quota-root boundary.
  if (fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    // Resolve the source inode so delegations can be broken and link
    // caps dropped.
    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination may or may not exist; -ENOENT is fine, any
    // other lookup error aborts the rename.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  // Request never reached make_request(); drop our reference.
  put_request(req);
  return res;
}
12358
// Low-level rename: move @parent/@name to @newparent/@newname.
int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
		      const char *newname, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
		<< vnewparent << " " << newname << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_rename" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << vnewparent.ino.val << std::endl;
  tout(cct) << newname << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(parent, name, perm);
    if (r < 0)
      return r;
    // The destination may legitimately not exist yet.
    r = may_delete(newparent, newname, perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }

  return _rename(parent, name, newparent, newname, perm);
}
12389
// Create hard link @dir/@newname to existing inode @in via
// CEPH_MDS_OP_LINK.  On success *inp references the linked inode.
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Both the target inode and the directory must be outside snapshots.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);    // the existing inode travels as path2
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // Request never reached make_request(); drop our reference.
  put_request(req);
  return res;
}
12434
// Low-level link: hard-link inode @in as @newparent/@newname.
int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
		    const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
    newname << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_link" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  // NOTE(review): this traces the full vinodeno, while sibling ll_*
  // calls trace only ino.val — confirm the trace-replay format before
  // normalizing.
  tout(cct) << vnewparent << std::endl;
  tout(cct) << newname << std::endl;

  int r = 0;
  InodeRef target;

  if (!cct->_conf->fuse_default_permissions) {
    // Hard links to directories are never allowed.
    if (S_ISDIR(in->mode))
      return -EPERM;

    r = may_hardlink(in, perm);
    if (r < 0)
      return r;

    r = may_create(newparent, perm);
    if (r < 0)
      return r;
  }

  return _link(in, newparent, newname, perm, &target);
}
12471
// Number of OSDs known to the currently cached osdmap.
int Client::ll_num_osds(void)
{
  Mutex::Locker lock(client_lock);
  return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
}
12477
12478int Client::ll_osdaddr(int osd, uint32_t *addr)
12479{
12480 Mutex::Locker lock(client_lock);
181888fb 12481
7c673cae
FG
12482 entity_addr_t g;
12483 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12484 if (!o.exists(osd))
12485 return false;
12486 g = o.get_addr(osd);
12487 return true;
12488 });
12489 if (!exists)
12490 return -1;
12491 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12492 *addr = ntohl(nb_addr);
12493 return 0;
12494}
181888fb 12495
7c673cae
FG
// Stripe unit (bytes) of the inode's file layout.
uint32_t Client::ll_stripe_unit(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->layout.stripe_unit;
}
12501
// Sequence number of the inode's snap realm.
// NOTE(review): dereferences in->snaprealm unconditionally — confirm
// callers guarantee the inode is attached to a realm.
uint64_t Client::ll_snap_seq(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->snaprealm->seq;
}
12507
// Copy the inode's file layout into *layout.  Always returns 0.
int Client::ll_file_layout(Inode *in, file_layout_t *layout)
{
  Mutex::Locker lock(client_lock);
  *layout = in->layout;
  return 0;
}
12514
// Convenience overload: layout of the inode behind an open file handle.
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
12519
12520/* Currently we cannot take advantage of redundancy in reads, since we
12521 would have to go through all possible placement groups (a
12522 potentially quite large number determined by a hash), and use CRUSH
12523 to calculate the appropriate set of OSDs for each placement group,
12524 then index into that. An array with one entry per OSD is much more
12525 tractable and works for demonstration purposes. */
12526
12527int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12528 file_layout_t* layout)
12529{
12530 Mutex::Locker lock(client_lock);
181888fb 12531
28e407b8 12532 inodeno_t ino = in->ino;
7c673cae
FG
12533 uint32_t object_size = layout->object_size;
12534 uint32_t su = layout->stripe_unit;
12535 uint32_t stripe_count = layout->stripe_count;
12536 uint64_t stripes_per_object = object_size / su;
12537
12538 uint64_t stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12539 uint64_t stripepos = blockno % stripe_count; // which object in the object set (X)
12540 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12541 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12542
12543 object_t oid = file_object_t(ino, objectno);
12544 return objecter->with_osdmap([&](const OSDMap& o) {
12545 ceph_object_layout olayout =
12546 o.file_to_object_layout(oid, *layout);
12547 pg_t pg = (pg_t)olayout.ol_pgid;
12548 vector<int> osds;
12549 int primary;
12550 o.pg_to_acting_osds(pg, &osds, &primary);
12551 return primary;
12552 });
12553}
12554
12555/* Return the offset of the block, internal to the object */
12556
12557uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12558{
12559 Mutex::Locker lock(client_lock);
12560 file_layout_t *layout=&(in->layout);
12561 uint32_t object_size = layout->object_size;
12562 uint32_t su = layout->stripe_unit;
12563 uint64_t stripes_per_object = object_size / su;
12564
12565 return (blockno % stripes_per_object) * su;
12566}
12567
// Low-level opendir: open directory inode @in for reading; on success
// *dirpp receives the directory handle.
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  tout(cct) << (unsigned long)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
		<< dendl;
  return r;
}
12595
// Low-level closedir: release a handle returned by ll_opendir().
int Client::ll_releasedir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  if (unmounting)
    return -ENOTCONN;

  _closedir(dirp);
  return 0;
}
12609
// Low-level fsync on a directory handle: flush the directory inode's
// metadata (syncdataonly=false).
int Client::ll_fsyncdir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fsync(dirp->inode.get(), false);
}
12622
// Low-level open of an existing inode.  O_CREAT is not supported
// here; creation goes through ll_create/_ll_create instead.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  assert(!(flags & O_CREAT));

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  // tout() records the call for trace replay.
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!cct->_conf->fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // Track the handle so an unmount can detect handles the ll caller
  // never closed.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
      " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
12658
12659int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
12660 int flags, InodeRef *in, int caps, Fh **fhp,
12661 const UserPerm& perms)
12662{
12663 *fhp = NULL;
12664
12665 vinodeno_t vparent = _get_vino(parent);
12666
1adf2230 12667 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
12668 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
12669 << ", gid " << perms.gid() << dendl;
12670 tout(cct) << "ll_create" << std::endl;
12671 tout(cct) << vparent.ino.val << std::endl;
12672 tout(cct) << name << std::endl;
12673 tout(cct) << mode << std::endl;
12674 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12675
12676 bool created = false;
12677 int r = _lookup(parent, name, caps, in, perms);
12678
12679 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
12680 return -EEXIST;
12681
12682 if (r == -ENOENT && (flags & O_CREAT)) {
12683 if (!cct->_conf->fuse_default_permissions) {
12684 r = may_create(parent, perms);
12685 if (r < 0)
12686 goto out;
12687 }
12688 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
12689 perms);
12690 if (r < 0)
12691 goto out;
12692 }
12693
12694 if (r < 0)
12695 goto out;
12696
12697 assert(*in);
12698
12699 ldout(cct, 20) << "_ll_create created = " << created << dendl;
12700 if (!created) {
12701 if (!cct->_conf->fuse_default_permissions) {
12702 r = may_open(in->get(), flags, perms);
12703 if (r < 0) {
12704 if (*fhp) {
12705 int release_r = _release_fh(*fhp);
12706 assert(release_r == 0); // during create, no async data ops should have happened
12707 }
12708 goto out;
12709 }
12710 }
12711 if (*fhp == NULL) {
12712 r = _open(in->get(), flags, mode, fhp, perms);
12713 if (r < 0)
12714 goto out;
12715 }
12716 }
12717
12718out:
12719 if (*fhp) {
12720 ll_unclosed_fh_set.insert(*fhp);
12721 }
12722
12723 ino_t ino = 0;
12724 if (r >= 0) {
12725 Inode *inode = in->get();
12726 if (use_faked_inos())
12727 ino = inode->faked_ino;
12728 else
12729 ino = inode->ino;
12730 }
12731
12732 tout(cct) << (unsigned long)*fhp << std::endl;
12733 tout(cct) << ino << std::endl;
1adf2230 12734 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
12735 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
12736 *fhp << " " << hex << ino << dec << ")" << dendl;
12737
12738 return r;
12739}
12740
12741int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12742 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12743 const UserPerm& perms)
12744{
12745 Mutex::Locker lock(client_lock);
12746 InodeRef in;
12747
181888fb
FG
12748 if (unmounting)
12749 return -ENOTCONN;
12750
7c673cae
FG
12751 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12752 fhp, perms);
12753 if (r >= 0) {
12754 assert(in);
12755
12756 // passing an Inode in outp requires an additional ref
12757 if (outp) {
12758 _ll_get(in.get());
12759 *outp = in.get();
12760 }
12761 fill_stat(in, attr);
12762 } else {
12763 attr->st_ino = 0;
12764 }
12765
12766 return r;
12767}
12768
12769int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12770 int oflags, Inode **outp, Fh **fhp,
12771 struct ceph_statx *stx, unsigned want, unsigned lflags,
12772 const UserPerm& perms)
12773{
12774 unsigned caps = statx_to_mask(lflags, want);
12775 Mutex::Locker lock(client_lock);
12776 InodeRef in;
12777
181888fb
FG
12778 if (unmounting)
12779 return -ENOTCONN;
7c673cae
FG
12780
12781 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12782 if (r >= 0) {
12783 assert(in);
12784
12785 // passing an Inode in outp requires an additional ref
12786 if (outp) {
12787 _ll_get(in.get());
12788 *outp = in.get();
12789 }
12790 fill_statx(in, caps, stx);
12791 } else {
12792 stx->stx_ino = 0;
12793 stx->stx_mask = 0;
12794 }
12795
12796 return r;
12797}
12798
12799loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12800{
12801 Mutex::Locker lock(client_lock);
12802 tout(cct) << "ll_lseek" << std::endl;
12803 tout(cct) << offset << std::endl;
12804 tout(cct) << whence << std::endl;
12805
181888fb
FG
12806 if (unmounting)
12807 return -ENOTCONN;
12808
7c673cae
FG
12809 return _lseek(fh, offset, whence);
12810}
12811
12812int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12813{
12814 Mutex::Locker lock(client_lock);
12815 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12816 tout(cct) << "ll_read" << std::endl;
12817 tout(cct) << (unsigned long)fh << std::endl;
12818 tout(cct) << off << std::endl;
12819 tout(cct) << len << std::endl;
12820
181888fb
FG
12821 if (unmounting)
12822 return -ENOTCONN;
12823
7c673cae
FG
12824 return _read(fh, off, len, bl);
12825}
12826
12827int Client::ll_read_block(Inode *in, uint64_t blockid,
12828 char *buf,
12829 uint64_t offset,
12830 uint64_t length,
12831 file_layout_t* layout)
12832{
12833 Mutex::Locker lock(client_lock);
181888fb
FG
12834
12835 if (unmounting)
12836 return -ENOTCONN;
12837
b32b8144 12838 vinodeno_t vino = _get_vino(in);
7c673cae
FG
12839 object_t oid = file_object_t(vino.ino, blockid);
12840 C_SaferCond onfinish;
12841 bufferlist bl;
12842
12843 objecter->read(oid,
12844 object_locator_t(layout->pool_id),
12845 offset,
12846 length,
12847 vino.snapid,
12848 &bl,
12849 CEPH_OSD_FLAG_READ,
12850 &onfinish);
12851
12852 client_lock.Unlock();
12853 int r = onfinish.wait();
12854 client_lock.Lock();
12855
12856 if (r >= 0) {
12857 bl.copy(0, bl.length(), buf);
12858 r = bl.length();
12859 }
12860
12861 return r;
12862}
12863
12864/* It appears that the OSD doesn't return success unless the entire
12865 buffer was written, return the write length on success. */
12866
12867int Client::ll_write_block(Inode *in, uint64_t blockid,
12868 char* buf, uint64_t offset,
12869 uint64_t length, file_layout_t* layout,
12870 uint64_t snapseq, uint32_t sync)
12871{
12872 Mutex flock("Client::ll_write_block flock");
12873 vinodeno_t vino = ll_get_vino(in);
12874 Cond cond;
12875 bool done;
12876 int r = 0;
181888fb 12877 Context *onsafe = nullptr;
7c673cae
FG
12878
12879 if (length == 0) {
12880 return -EINVAL;
12881 }
12882 if (true || sync) {
12883 /* if write is stable, the epilogue is waiting on
12884 * flock */
12885 onsafe = new C_SafeCond(&flock, &cond, &done, &r);
12886 done = false;
12887 } else {
12888 /* if write is unstable, we just place a barrier for
12889 * future commits to wait on */
12890 /*onsafe = new C_Block_Sync(this, vino.ino,
12891 barrier_interval(offset, offset + length), &r);
12892 */
12893 done = true;
12894 }
12895 object_t oid = file_object_t(vino.ino, blockid);
12896 SnapContext fakesnap;
12897 bufferptr bp;
12898 if (length > 0) bp = buffer::copy(buf, length);
12899 bufferlist bl;
12900 bl.push_back(bp);
12901
12902 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
12903 << dendl;
12904
12905 fakesnap.seq = snapseq;
12906
12907 /* lock just in time */
12908 client_lock.Lock();
181888fb
FG
12909 if (unmounting) {
12910 client_lock.Unlock();
12911 delete onsafe;
12912 return -ENOTCONN;
12913 }
7c673cae
FG
12914
12915 objecter->write(oid,
12916 object_locator_t(layout->pool_id),
12917 offset,
12918 length,
12919 fakesnap,
12920 bl,
12921 ceph::real_clock::now(),
12922 0,
12923 onsafe);
12924
12925 client_lock.Unlock();
12926 if (!done /* also !sync */) {
12927 flock.Lock();
12928 while (! done)
12929 cond.Wait(flock);
12930 flock.Unlock();
12931 }
12932
12933 if (r < 0) {
12934 return r;
12935 } else {
12936 return length;
12937 }
12938}
12939
12940int Client::ll_commit_blocks(Inode *in,
12941 uint64_t offset,
12942 uint64_t length)
12943{
12944 Mutex::Locker lock(client_lock);
12945 /*
12946 BarrierContext *bctx;
b32b8144 12947 vinodeno_t vino = _get_vino(in);
7c673cae
FG
12948 uint64_t ino = vino.ino;
12949
12950 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
12951 << offset << " to " << length << dendl;
12952
12953 if (length == 0) {
12954 return -EINVAL;
12955 }
12956
12957 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
12958 if (p != barriers.end()) {
12959 barrier_interval civ(offset, offset + length);
12960 p->second->commit_barrier(civ);
12961 }
12962 */
12963 return 0;
12964}
12965
12966int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12967{
12968 Mutex::Locker lock(client_lock);
12969 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12970 "~" << len << dendl;
12971 tout(cct) << "ll_write" << std::endl;
12972 tout(cct) << (unsigned long)fh << std::endl;
12973 tout(cct) << off << std::endl;
12974 tout(cct) << len << std::endl;
12975
181888fb
FG
12976 if (unmounting)
12977 return -ENOTCONN;
12978
7c673cae
FG
12979 int r = _write(fh, off, len, data, NULL, 0);
12980 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
12981 << dendl;
12982 return r;
12983}
12984
12985int Client::ll_flush(Fh *fh)
12986{
12987 Mutex::Locker lock(client_lock);
12988 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
12989 tout(cct) << "ll_flush" << std::endl;
12990 tout(cct) << (unsigned long)fh << std::endl;
12991
181888fb
FG
12992 if (unmounting)
12993 return -ENOTCONN;
12994
7c673cae
FG
12995 return _flush(fh);
12996}
12997
12998int Client::ll_fsync(Fh *fh, bool syncdataonly)
12999{
13000 Mutex::Locker lock(client_lock);
13001 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13002 tout(cct) << "ll_fsync" << std::endl;
13003 tout(cct) << (unsigned long)fh << std::endl;
13004
181888fb
FG
13005 if (unmounting)
13006 return -ENOTCONN;
13007
7c673cae
FG
13008 int r = _fsync(fh, syncdataonly);
13009 if (r) {
13010 // If we're returning an error, clear it from the FH
13011 fh->take_async_err();
13012 }
13013 return r;
13014}
13015
28e407b8
AA
13016int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13017{
13018 Mutex::Locker lock(client_lock);
13019 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13020 tout(cct) << "ll_sync_inode" << std::endl;
13021 tout(cct) << (unsigned long)in << std::endl;
13022
13023 if (unmounting)
13024 return -ENOTCONN;
13025
13026 return _fsync(in, syncdataonly);
13027}
13028
7c673cae
FG
13029#ifdef FALLOC_FL_PUNCH_HOLE
13030
13031int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13032{
13033 if (offset < 0 || length <= 0)
13034 return -EINVAL;
13035
13036 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
13037 return -EOPNOTSUPP;
13038
13039 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
13040 return -EOPNOTSUPP;
13041
13042 Inode *in = fh->inode.get();
13043
13044 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
13045 !(mode & FALLOC_FL_PUNCH_HOLE)) {
13046 return -ENOSPC;
13047 }
13048
13049 if (in->snapid != CEPH_NOSNAP)
13050 return -EROFS;
13051
13052 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
13053 return -EBADF;
13054
13055 uint64_t size = offset + length;
28e407b8 13056 std::list<InodeRef> quota_roots;
7c673cae
FG
13057 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
13058 size > in->size &&
28e407b8 13059 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms, &quota_roots)) {
7c673cae
FG
13060 return -EDQUOT;
13061 }
13062
13063 int have;
13064 int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
13065 if (r < 0)
13066 return r;
13067
13068 Mutex uninline_flock("Client::_fallocate_uninline_data flock");
13069 Cond uninline_cond;
13070 bool uninline_done = false;
13071 int uninline_ret = 0;
13072 Context *onuninline = NULL;
13073
13074 if (mode & FALLOC_FL_PUNCH_HOLE) {
13075 if (in->inline_version < CEPH_INLINE_NONE &&
13076 (have & CEPH_CAP_FILE_BUFFER)) {
13077 bufferlist bl;
13078 int len = in->inline_data.length();
13079 if (offset < len) {
13080 if (offset > 0)
13081 in->inline_data.copy(0, offset, bl);
13082 int size = length;
13083 if (offset + size > len)
13084 size = len - offset;
13085 if (size > 0)
13086 bl.append_zero(size);
13087 if (offset + size < len)
13088 in->inline_data.copy(offset + size, len - offset - size, bl);
13089 in->inline_data = bl;
13090 in->inline_version++;
13091 }
91327a77 13092 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13093 in->change_attr++;
28e407b8 13094 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13095 } else {
13096 if (in->inline_version < CEPH_INLINE_NONE) {
13097 onuninline = new C_SafeCond(&uninline_flock,
13098 &uninline_cond,
13099 &uninline_done,
13100 &uninline_ret);
13101 uninline_data(in, onuninline);
13102 }
13103
13104 Mutex flock("Client::_punch_hole flock");
13105 Cond cond;
13106 bool done = false;
13107 Context *onfinish = new C_SafeCond(&flock, &cond, &done);
13108
13109 unsafe_sync_write++;
13110 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
13111
13112 _invalidate_inode_cache(in, offset, length);
13113 filer->zero(in->ino, &in->layout,
13114 in->snaprealm->get_snap_context(),
13115 offset, length,
13116 ceph::real_clock::now(),
13117 0, true, onfinish);
91327a77 13118 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13119 in->change_attr++;
28e407b8 13120 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13121
13122 client_lock.Unlock();
13123 flock.Lock();
13124 while (!done)
13125 cond.Wait(flock);
13126 flock.Unlock();
13127 client_lock.Lock();
13128 _sync_write_commit(in);
13129 }
13130 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
13131 uint64_t size = offset + length;
13132 if (size > in->size) {
13133 in->size = size;
91327a77 13134 in->mtime = in->ctime = ceph_clock_now();
7c673cae 13135 in->change_attr++;
28e407b8 13136 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 13137
28e407b8 13138 if (is_quota_bytes_approaching(in, quota_roots)) {
7c673cae 13139 check_caps(in, CHECK_CAPS_NODELAY);
31f18b77
FG
13140 } else if (is_max_size_approaching(in)) {
13141 check_caps(in, 0);
7c673cae
FG
13142 }
13143 }
13144 }
13145
13146 if (onuninline) {
13147 client_lock.Unlock();
13148 uninline_flock.Lock();
13149 while (!uninline_done)
13150 uninline_cond.Wait(uninline_flock);
13151 uninline_flock.Unlock();
13152 client_lock.Lock();
13153
13154 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
13155 in->inline_data.clear();
13156 in->inline_version = CEPH_INLINE_NONE;
28e407b8 13157 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13158 check_caps(in, 0);
13159 } else
13160 r = uninline_ret;
13161 }
13162
13163 put_cap_ref(in, CEPH_CAP_FILE_WR);
13164 return r;
13165}
13166#else
13167
13168int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13169{
13170 return -EOPNOTSUPP;
13171}
13172
13173#endif
13174
13175
13176int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
13177{
13178 Mutex::Locker lock(client_lock);
13179 ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
13180 tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
13181 tout(cct) << (unsigned long)fh << std::endl;
13182
181888fb
FG
13183 if (unmounting)
13184 return -ENOTCONN;
13185
7c673cae
FG
13186 return _fallocate(fh, mode, offset, length);
13187}
13188
13189int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13190{
13191 Mutex::Locker lock(client_lock);
13192 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
13193
181888fb
FG
13194 if (unmounting)
13195 return -ENOTCONN;
13196
7c673cae
FG
13197 Fh *fh = get_filehandle(fd);
13198 if (!fh)
13199 return -EBADF;
13200#if defined(__linux__) && defined(O_PATH)
13201 if (fh->flags & O_PATH)
13202 return -EBADF;
13203#endif
13204 return _fallocate(fh, mode, offset, length);
13205}
13206
13207int Client::ll_release(Fh *fh)
13208{
13209 Mutex::Locker lock(client_lock);
91327a77
AA
13210
13211 if (unmounting)
13212 return -ENOTCONN;
13213
7c673cae
FG
13214 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
13215 dendl;
13216 tout(cct) << "ll_release (fh)" << std::endl;
13217 tout(cct) << (unsigned long)fh << std::endl;
13218
13219 if (ll_unclosed_fh_set.count(fh))
13220 ll_unclosed_fh_set.erase(fh);
13221 return _release_fh(fh);
13222}
13223
13224int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13225{
13226 Mutex::Locker lock(client_lock);
13227
13228 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13229 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13230
181888fb
FG
13231 if (unmounting)
13232 return -ENOTCONN;
13233
7c673cae
FG
13234 return _getlk(fh, fl, owner);
13235}
13236
13237int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13238{
13239 Mutex::Locker lock(client_lock);
13240
13241 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
13242 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13243
181888fb
FG
13244 if (unmounting)
13245 return -ENOTCONN;
13246
7c673cae
FG
13247 return _setlk(fh, fl, owner, sleep);
13248}
13249
13250int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13251{
13252 Mutex::Locker lock(client_lock);
13253
13254 ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
13255 tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
13256
181888fb
FG
13257 if (unmounting)
13258 return -ENOTCONN;
13259
7c673cae
FG
13260 return _flock(fh, cmd, owner);
13261}
13262
b32b8144
FG
13263int Client::set_deleg_timeout(uint32_t timeout)
13264{
13265 Mutex::Locker lock(client_lock);
13266
13267 /*
13268 * The whole point is to prevent blacklisting so we must time out the
13269 * delegation before the session autoclose timeout kicks in.
13270 */
13271 if (timeout >= mdsmap->get_session_autoclose())
13272 return -EINVAL;
13273
13274 deleg_timeout = timeout;
13275 return 0;
13276}
13277
13278int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13279{
13280 int ret = -EINVAL;
13281
13282 Mutex::Locker lock(client_lock);
13283
13284 if (!mounted)
13285 return -ENOTCONN;
13286
13287 Inode *inode = fh->inode.get();
13288
13289 switch(cmd) {
13290 case CEPH_DELEGATION_NONE:
13291 inode->unset_deleg(fh);
13292 ret = 0;
13293 break;
13294 default:
13295 try {
13296 ret = inode->set_deleg(fh, cmd, cb, priv);
13297 } catch (std::bad_alloc) {
13298 ret = -ENOMEM;
13299 }
13300 break;
13301 }
13302 return ret;
13303}
13304
7c673cae
FG
13305class C_Client_RequestInterrupt : public Context {
13306private:
13307 Client *client;
13308 MetaRequest *req;
13309public:
13310 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13311 req->get();
13312 }
13313 void finish(int r) override {
13314 Mutex::Locker l(client->client_lock);
13315 assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
13316 client->_interrupt_filelock(req);
13317 client->put_request(req);
13318 }
13319};
13320
13321void Client::ll_interrupt(void *d)
13322{
13323 MetaRequest *req = static_cast<MetaRequest*>(d);
13324 ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
13325 tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
13326 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13327}
13328
13329// =========================================
13330// layout
13331
13332// expose file layouts
13333
13334int Client::describe_layout(const char *relpath, file_layout_t *lp,
13335 const UserPerm& perms)
13336{
13337 Mutex::Locker lock(client_lock);
13338
181888fb
FG
13339 if (unmounting)
13340 return -ENOTCONN;
13341
7c673cae
FG
13342 filepath path(relpath);
13343 InodeRef in;
13344 int r = path_walk(path, &in, perms);
13345 if (r < 0)
13346 return r;
13347
13348 *lp = in->layout;
13349
13350 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
13351 return 0;
13352}
13353
13354int Client::fdescribe_layout(int fd, file_layout_t *lp)
13355{
13356 Mutex::Locker lock(client_lock);
13357
181888fb
FG
13358 if (unmounting)
13359 return -ENOTCONN;
13360
7c673cae
FG
13361 Fh *f = get_filehandle(fd);
13362 if (!f)
13363 return -EBADF;
13364 Inode *in = f->inode.get();
13365
13366 *lp = in->layout;
13367
13368 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
13369 return 0;
13370}
13371
d2e6a577
FG
13372int64_t Client::get_default_pool_id()
13373{
13374 Mutex::Locker lock(client_lock);
181888fb
FG
13375
13376 if (unmounting)
13377 return -ENOTCONN;
13378
d2e6a577
FG
13379 /* first data pool is the default */
13380 return mdsmap->get_first_data_pool();
13381}
7c673cae
FG
13382
13383// expose osdmap
13384
13385int64_t Client::get_pool_id(const char *pool_name)
13386{
13387 Mutex::Locker lock(client_lock);
181888fb
FG
13388
13389 if (unmounting)
13390 return -ENOTCONN;
13391
7c673cae
FG
13392 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13393 pool_name);
13394}
13395
13396string Client::get_pool_name(int64_t pool)
13397{
13398 Mutex::Locker lock(client_lock);
181888fb
FG
13399
13400 if (unmounting)
13401 return string();
13402
7c673cae
FG
13403 return objecter->with_osdmap([pool](const OSDMap& o) {
13404 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13405 });
13406}
13407
13408int Client::get_pool_replication(int64_t pool)
13409{
13410 Mutex::Locker lock(client_lock);
181888fb
FG
13411
13412 if (unmounting)
13413 return -ENOTCONN;
13414
7c673cae
FG
13415 return objecter->with_osdmap([pool](const OSDMap& o) {
13416 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13417 });
13418}
13419
13420int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13421{
13422 Mutex::Locker lock(client_lock);
13423
181888fb
FG
13424 if (unmounting)
13425 return -ENOTCONN;
13426
7c673cae
FG
13427 Fh *f = get_filehandle(fd);
13428 if (!f)
13429 return -EBADF;
13430 Inode *in = f->inode.get();
13431
13432 vector<ObjectExtent> extents;
13433 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
13434 assert(extents.size() == 1);
13435
13436 objecter->with_osdmap([&](const OSDMap& o) {
13437 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13438 o.pg_to_acting_osds(pg, osds);
13439 });
13440
13441 if (osds.empty())
13442 return -EINVAL;
13443
13444 /*
13445 * Return the remainder of the extent (stripe unit)
13446 *
13447 * If length = 1 is passed to Striper::file_to_extents we get a single
13448 * extent back, but its length is one so we still need to compute the length
13449 * to the end of the stripe unit.
13450 *
13451 * If length = su then we may get 1 or 2 objects back in the extents vector
13452 * which would have to be examined. Even then, the offsets are local to the
13453 * object, so matching up to the file offset is extra work.
13454 *
13455 * It seems simpler to stick with length = 1 and manually compute the
13456 * remainder.
13457 */
13458 if (len) {
13459 uint64_t su = in->layout.stripe_unit;
13460 *len = su - (off % su);
13461 }
13462
13463 return 0;
13464}
13465
13466int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13467{
13468 Mutex::Locker lock(client_lock);
181888fb
FG
13469
13470 if (unmounting)
13471 return -ENOTCONN;
13472
7c673cae
FG
13473 if (id < 0)
13474 return -EINVAL;
13475 return objecter->with_osdmap([&](const OSDMap& o) {
13476 return o.crush->get_full_location_ordered(id, path);
13477 });
13478}
13479
13480int Client::get_file_stripe_address(int fd, loff_t offset,
13481 vector<entity_addr_t>& address)
13482{
13483 Mutex::Locker lock(client_lock);
13484
181888fb
FG
13485 if (unmounting)
13486 return -ENOTCONN;
13487
7c673cae
FG
13488 Fh *f = get_filehandle(fd);
13489 if (!f)
13490 return -EBADF;
13491 Inode *in = f->inode.get();
13492
13493 // which object?
13494 vector<ObjectExtent> extents;
13495 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13496 in->truncate_size, extents);
13497 assert(extents.size() == 1);
13498
13499 // now we have the object and its 'layout'
13500 return objecter->with_osdmap([&](const OSDMap& o) {
13501 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13502 vector<int> osds;
13503 o.pg_to_acting_osds(pg, osds);
13504 if (osds.empty())
13505 return -EINVAL;
13506 for (unsigned i = 0; i < osds.size(); i++) {
13507 entity_addr_t addr = o.get_addr(osds[i]);
13508 address.push_back(addr);
13509 }
13510 return 0;
13511 });
13512}
13513
13514int Client::get_osd_addr(int osd, entity_addr_t& addr)
13515{
13516 Mutex::Locker lock(client_lock);
181888fb
FG
13517
13518 if (unmounting)
13519 return -ENOTCONN;
13520
7c673cae
FG
13521 return objecter->with_osdmap([&](const OSDMap& o) {
13522 if (!o.exists(osd))
13523 return -ENOENT;
13524
13525 addr = o.get_addr(osd);
13526 return 0;
13527 });
13528}
13529
13530int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13531 loff_t length, loff_t offset)
13532{
13533 Mutex::Locker lock(client_lock);
13534
181888fb
FG
13535 if (unmounting)
13536 return -ENOTCONN;
13537
7c673cae
FG
13538 Fh *f = get_filehandle(fd);
13539 if (!f)
13540 return -EBADF;
13541 Inode *in = f->inode.get();
13542
13543 // map to a list of extents
13544 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13545
13546 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13547 return 0;
13548}
13549
13550
b32b8144 13551/* find an osd with the same ip. -ENXIO if none. */
7c673cae
FG
13552int Client::get_local_osd()
13553{
13554 Mutex::Locker lock(client_lock);
181888fb
FG
13555
13556 if (unmounting)
13557 return -ENOTCONN;
13558
7c673cae
FG
13559 objecter->with_osdmap([this](const OSDMap& o) {
13560 if (o.get_epoch() != local_osd_epoch) {
13561 local_osd = o.find_osd_on_ip(messenger->get_myaddr());
13562 local_osd_epoch = o.get_epoch();
13563 }
13564 });
13565 return local_osd;
13566}
13567
13568
13569
13570
13571
13572
13573// ===============================
13574
13575void Client::ms_handle_connect(Connection *con)
13576{
13577 ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
13578}
13579
13580bool Client::ms_handle_reset(Connection *con)
13581{
13582 ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
13583 return false;
13584}
13585
13586void Client::ms_handle_remote_reset(Connection *con)
13587{
13588 ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
13589 Mutex::Locker l(client_lock);
13590 switch (con->get_peer_type()) {
13591 case CEPH_ENTITY_TYPE_MDS:
13592 {
13593 // kludge to figure out which mds this is; fixme with a Connection* state
13594 mds_rank_t mds = MDS_RANK_NONE;
13595 MetaSession *s = NULL;
13596 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
13597 p != mds_sessions.end();
13598 ++p) {
13599 if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
13600 mds = p->first;
13601 s = p->second;
13602 }
13603 }
13604 if (mds >= 0) {
d2e6a577 13605 assert (s != NULL);
7c673cae
FG
13606 switch (s->state) {
13607 case MetaSession::STATE_CLOSING:
13608 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
13609 _closed_mds_session(s);
13610 break;
13611
13612 case MetaSession::STATE_OPENING:
13613 {
13614 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
13615 list<Context*> waiters;
13616 waiters.swap(s->waiting_for_open);
13617 _closed_mds_session(s);
13618 MetaSession *news = _get_or_open_mds_session(mds);
13619 news->waiting_for_open.swap(waiters);
13620 }
13621 break;
13622
13623 case MetaSession::STATE_OPEN:
13624 {
28e407b8 13625 objecter->maybe_request_map(); /* to check if we are blacklisted */
7c673cae
FG
13626 const md_config_t *conf = cct->_conf;
13627 if (conf->client_reconnect_stale) {
13628 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
13629 _closed_mds_session(s);
13630 } else {
13631 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
13632 s->state = MetaSession::STATE_STALE;
13633 }
13634 }
13635 break;
13636
13637 case MetaSession::STATE_NEW:
13638 case MetaSession::STATE_CLOSED:
13639 default:
13640 break;
13641 }
13642 }
13643 }
13644 break;
13645 }
13646}
13647
13648bool Client::ms_handle_refused(Connection *con)
13649{
13650 ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
13651 return false;
13652}
13653
13654bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
13655{
13656 if (dest_type == CEPH_ENTITY_TYPE_MON)
13657 return true;
13658 *authorizer = monclient->build_authorizer(dest_type);
13659 return true;
13660}
13661
13662Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
13663{
13664 Inode *cur = in;
13665 utime_t now = ceph_clock_now();
13666
13667 while (cur) {
13668 if (cur != in && cur->quota.is_enable())
13669 break;
13670
13671 Inode *parent_in = NULL;
13672 if (!cur->dn_set.empty()) {
13673 for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
13674 Dentry *dn = *p;
13675 if (dn->lease_mds >= 0 &&
13676 dn->lease_ttl > now &&
13677 mds_sessions.count(dn->lease_mds)) {
13678 parent_in = dn->dir->parent_inode;
13679 } else {
13680 Inode *diri = dn->dir->parent_inode;
13681 if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
13682 diri->shared_gen == dn->cap_shared_gen) {
13683 parent_in = dn->dir->parent_inode;
13684 }
13685 }
13686 if (parent_in)
13687 break;
13688 }
13689 } else if (root_parents.count(cur)) {
13690 parent_in = root_parents[cur].get();
13691 }
13692
13693 if (parent_in) {
13694 cur = parent_in;
13695 continue;
13696 }
13697
13698 if (cur == root_ancestor)
13699 break;
13700
181888fb
FG
13701 // deleted inode
13702 if (cur->nlink == 0) {
13703 cur = root_ancestor;
13704 break;
13705 }
13706
7c673cae
FG
13707 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
13708 filepath path(cur->ino);
13709 req->set_filepath(path);
13710 req->set_inode(cur);
13711
13712 InodeRef parent_ref;
13713 int ret = make_request(req, perms, &parent_ref);
13714 if (ret < 0) {
13715 ldout(cct, 1) << __func__ << " " << in->vino()
13716 << " failed to find parent of " << cur->vino()
13717 << " err " << ret << dendl;
13718 // FIXME: what to do?
13719 cur = root_ancestor;
13720 break;
13721 }
13722
13723 now = ceph_clock_now();
13724 if (cur == in)
13725 cur = parent_ref.get();
13726 else
13727 cur = in; // start over
13728 }
13729
13730 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
13731 return cur;
13732}
13733
13734/**
13735 * Traverse quota ancestors of the Inode, return true
13736 * if any of them passes the passed function
13737 */
13738bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13739 std::function<bool (const Inode &in)> test)
13740{
13741 while (true) {
13742 assert(in != NULL);
13743 if (test(*in)) {
13744 return true;
13745 }
13746
13747 if (in == root_ancestor) {
13748 // We're done traversing, drop out
13749 return false;
13750 } else {
13751 // Continue up the tree
13752 in = get_quota_root(in, perms);
13753 }
13754 }
13755
13756 return false;
13757}
13758
13759bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13760{
13761 return check_quota_condition(in, perms,
13762 [](const Inode &in) {
13763 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13764 });
13765}
13766
13767bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
28e407b8
AA
13768 const UserPerm& perms,
13769 std::list<InodeRef>* quota_roots)
7c673cae
FG
13770{
13771 return check_quota_condition(in, perms,
28e407b8
AA
13772 [&new_bytes, quota_roots](const Inode &in) {
13773 if (quota_roots)
13774 quota_roots->emplace_back(const_cast<Inode*>(&in));
7c673cae
FG
13775 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13776 > in.quota.max_bytes;
13777 });
13778}
13779
28e407b8 13780bool Client::is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots)
7c673cae 13781{
28e407b8
AA
13782 assert(in->size >= in->reported_size);
13783 const uint64_t size = in->size - in->reported_size;
13784
13785 for (auto& diri : quota_roots) {
13786 if (diri->quota.max_bytes) {
13787 if (diri->rstat.rbytes >= diri->quota.max_bytes)
13788 return true;
13789
13790 uint64_t space = diri->quota.max_bytes - diri->rstat.rbytes;
13791 if ((space >> 4) < size)
13792 return true;
13793 }
13794 }
13795 return false;
7c673cae
FG
13796}
13797
// State bits for the pool_perms cache used by check_pool_perm().  CHECKING
// marks a probe in flight (concurrent callers wait on waiting_for_pool_perm);
// CHECKED marks a completed probe; READ/WRITE record which accesses the
// probe found to be permitted.
enum {
  POOL_CHECKED = 1,   // permission probe has completed for this pool
  POOL_CHECKING = 2,  // probe in flight; other callers must wait
  POOL_READ = 4,      // read access confirmed
  POOL_WRITE = 8,     // write access confirmed
};
13804
/**
 * Verify that the client's OSD capabilities permit the access in @need
 * (CEPH_CAP_FILE_RD and/or CEPH_CAP_FILE_WR) to the data pool backing @in.
 *
 * Results are cached in pool_perms, keyed by (pool id, pool namespace).
 * The first caller for a pool issues a probe read and a probe write against
 * the inode's first object; concurrent callers block on
 * waiting_for_pool_perm until the probe finishes.
 *
 * Returns 0 if permitted, -EPERM if the needed access is denied, or -EIO
 * if a probe failed for a reason other than permissions.
 */
int Client::check_pool_perm(Inode *in, int need)
{
  // The whole check can be disabled by configuration.
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is mid-probe, wait and re-check.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the probe in-flight so other callers wait instead of racing.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Probe read: a stat of the first object.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Probe write: an exclusive create (an EEXIST reply still proves
    // write access reached the OSD).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while both probes are in flight.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    // 0 or ENOENT means the read was executed by the OSD: read perm exists.
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // 0 or EEXIST means the create was executed by the OSD: write perm exists.
    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // Cache the completed result and wake any waiters.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13907
13908int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13909{
13910 if (acl_type == POSIX_ACL) {
13911 if (in->xattrs.count(ACL_EA_ACCESS)) {
13912 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13913
13914 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13915 }
13916 }
13917 return -EAGAIN;
13918}
13919
13920int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
13921{
13922 if (acl_type == NO_ACL)
13923 return 0;
13924
13925 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
13926 if (r < 0)
13927 goto out;
13928
13929 if (acl_type == POSIX_ACL) {
13930 if (in->xattrs.count(ACL_EA_ACCESS)) {
13931 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13932 bufferptr acl(access_acl.c_str(), access_acl.length());
13933 r = posix_acl_access_chmod(acl, mode);
13934 if (r < 0)
13935 goto out;
13936 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
13937 } else {
13938 r = 0;
13939 }
13940 }
13941out:
13942 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
13943 return r;
13944}
13945
13946int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
13947 const UserPerm& perms)
13948{
13949 if (acl_type == NO_ACL)
13950 return 0;
13951
13952 if (S_ISLNK(*mode))
13953 return 0;
13954
13955 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
13956 if (r < 0)
13957 goto out;
13958
13959 if (acl_type == POSIX_ACL) {
13960 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
13961 map<string, bufferptr> xattrs;
13962
13963 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
13964 bufferptr acl(default_acl.c_str(), default_acl.length());
13965 r = posix_acl_inherit_mode(acl, mode);
13966 if (r < 0)
13967 goto out;
13968
13969 if (r > 0) {
13970 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
13971 if (r < 0)
13972 goto out;
13973 if (r > 0)
13974 xattrs[ACL_EA_ACCESS] = acl;
13975 }
13976
13977 if (S_ISDIR(*mode))
13978 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
13979
13980 r = xattrs.size();
13981 if (r > 0)
13982 ::encode(xattrs, xattrs_bl);
13983 } else {
13984 if (umask_cb)
13985 *mode &= ~umask_cb(callback_handle);
13986 r = 0;
13987 }
13988 }
13989out:
13990 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
13991 return r;
13992}
13993
13994void Client::set_filer_flags(int flags)
13995{
13996 Mutex::Locker l(client_lock);
13997 assert(flags == 0 ||
13998 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13999 objecter->add_global_op_flags(flags);
14000}
14001
14002void Client::clear_filer_flags(int flags)
14003{
14004 Mutex::Locker l(client_lock);
14005 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14006 objecter->clear_global_op_flag(flags);
14007}
14008
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  // Recorded here; presumably read when composing cap release messages
  // (the senders are outside this chunk).
  cap_epoch_barrier = e;
}
14021
// Config observer hook: the NULL-terminated list of configuration keys
// whose changes should be delivered to handle_conf_change().
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
14034
14035void Client::handle_conf_change(const struct md_config_t *conf,
14036 const std::set <std::string> &changed)
14037{
14038 Mutex::Locker lock(client_lock);
14039
181888fb 14040 if (changed.count("client_cache_mid")) {
7c673cae
FG
14041 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14042 }
14043 if (changed.count("client_acl_type")) {
14044 acl_type = NO_ACL;
14045 if (cct->_conf->client_acl_type == "posix_acl")
14046 acl_type = POSIX_ACL;
14047 }
14048}
14049
7c673cae
FG
// Reference-count hook (intrusive_ptr protocol): take a ref on the Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14054
// Reference-count hook (intrusive_ptr protocol): drop a ref, routed
// through the owning client so put_inode() can do any final cleanup.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14059
14060mds_rank_t Client::_get_random_up_mds() const
14061{
14062 assert(client_lock.is_locked_by_me());
14063
14064 std::set<mds_rank_t> up;
14065 mdsmap->get_up_mds_set(up);
14066
14067 if (up.empty())
14068 return MDS_RANK_NONE;
14069 std::set<mds_rank_t>::const_iterator p = up.begin();
14070 for (int n = rand() % up.size(); n; n--)
14071 ++p;
14072 return *p;
14073}
14074
14075
// A Client that owns its own Objecter (rather than sharing one), wired to
// the given messenger and mon client.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14082
StandaloneClient::~StandaloneClient()
{
  // We created the objecter in our constructor, so we destroy it here.
  delete objecter;
  objecter = nullptr;
}
14088
/**
 * Bring up a standalone client: timer, object cacher, objecter,
 * dispatchers, and the mon client.  Returns 0 on success or the mon
 * client's error code on failure (after undoing the partial init).
 */
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    // NOTE(review): the lock is released before objecter/objectcacher/
    // monclient teardown — preserve this ordering; those shutdowns look
    // like they must run without client_lock held.
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
14119
// Tear down in reverse of init(): generic Client shutdown first, then the
// objecter and mon client this standalone instance brought up itself.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}