]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
update sources to 12.2.8
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
21#include <sys/stat.h>
22#include <sys/param.h>
23#include <fcntl.h>
24#include <sys/file.h>
25#include <sys/utsname.h>
26#include <sys/uio.h>
27
28#include <boost/lexical_cast.hpp>
29#include <boost/fusion/include/std_pair.hpp>
30
31#if defined(__FreeBSD__)
32#define XATTR_CREATE 0x1
33#define XATTR_REPLACE 0x2
34#else
35#include <sys/xattr.h>
36#endif
37
38#if defined(__linux__)
39#include <linux/falloc.h>
40#endif
41
42#include <sys/statvfs.h>
43
44#include "common/config.h"
45#include "common/version.h"
46
47// ceph stuff
48#include "messages/MClientSession.h"
49#include "messages/MClientReconnect.h"
50#include "messages/MClientRequest.h"
51#include "messages/MClientRequestForward.h"
52#include "messages/MClientReply.h"
53#include "messages/MClientCaps.h"
54#include "messages/MClientLease.h"
55#include "messages/MClientSnap.h"
56#include "messages/MCommandReply.h"
57#include "messages/MOSDMap.h"
58#include "messages/MClientQuota.h"
59#include "messages/MClientCapRelease.h"
60#include "messages/MMDSMap.h"
61#include "messages/MFSMap.h"
62#include "messages/MFSMapUser.h"
63
64#include "mon/MonClient.h"
65
66#include "mds/flock.h"
67#include "osd/OSDMap.h"
68#include "osdc/Filer.h"
69
70#include "common/Cond.h"
71#include "common/Mutex.h"
72#include "common/perf_counters.h"
73#include "common/admin_socket.h"
74#include "common/errno.h"
75#include "include/str_list.h"
76
77#define dout_subsys ceph_subsys_client
78
79#include "include/lru.h"
80#include "include/compat.h"
81#include "include/stringify.h"
82
83#include "Client.h"
84#include "Inode.h"
85#include "Dentry.h"
b32b8144 86#include "Delegation.h"
7c673cae
FG
87#include "Dir.h"
88#include "ClientSnapRealm.h"
89#include "Fh.h"
90#include "MetaSession.h"
91#include "MetaRequest.h"
92#include "ObjecterWriteback.h"
93#include "posix_acl.h"
94
95#include "include/assert.h"
96#include "include/stat.h"
97
98#include "include/cephfs/ceph_statx.h"
99
100#if HAVE_GETGROUPLIST
101#include <grp.h>
102#include <pwd.h>
103#include <unistd.h>
104#endif
105
106#undef dout_prefix
107#define dout_prefix *_dout << "client." << whoami << " "
108
109#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
110
111// FreeBSD fails to define this
112#ifndef O_DSYNC
113#define O_DSYNC 0x0
114#endif
115// Darwin fails to define this
116#ifndef O_RSYNC
117#define O_RSYNC 0x0
118#endif
119
120#ifndef O_DIRECT
121#define O_DIRECT 0x0
122#endif
123
124#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
125
126void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
127{
128 Client *client = static_cast<Client*>(p);
129 client->flush_set_callback(oset);
130}
131
132
133// -------------
134
135Client::CommandHook::CommandHook(Client *client) :
136 m_client(client)
137{
138}
139
140bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
141 std::string format, bufferlist& out)
142{
143 Formatter *f = Formatter::create(format);
144 f->open_object_section("result");
145 m_client->client_lock.Lock();
146 if (command == "mds_requests")
147 m_client->dump_mds_requests(f);
148 else if (command == "mds_sessions")
149 m_client->dump_mds_sessions(f);
150 else if (command == "dump_cache")
151 m_client->dump_cache(f);
152 else if (command == "kick_stale_sessions")
153 m_client->_kick_stale_sessions();
154 else if (command == "status")
155 m_client->dump_status(f);
156 else
157 assert(0 == "bad command registered");
158 m_client->client_lock.Unlock();
159 f->close_section();
160 f->flush(out);
161 delete f;
162 return true;
163}
164
165
166// -------------
167
168dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
169 : inode(in), offset(0), next_offset(2),
170 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
171 perms(perms)
172 { }
173
174void Client::_reset_faked_inos()
175{
176 ino_t start = 1024;
177 free_faked_inos.clear();
178 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
179 last_used_faked_ino = 0;
180 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
181}
182
183void Client::_assign_faked_ino(Inode *in)
184{
185 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
186 if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
187 last_used_faked_ino = 0;
188 it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
189 }
190 assert(it != free_faked_inos.end());
191 if (last_used_faked_ino < it.get_start()) {
192 assert(it.get_len() > 0);
193 last_used_faked_ino = it.get_start();
194 } else {
195 ++last_used_faked_ino;
196 assert(it.get_start() + it.get_len() > last_used_faked_ino);
197 }
198 in->faked_ino = last_used_faked_ino;
199 free_faked_inos.erase(in->faked_ino);
200 faked_ino_map[in->faked_ino] = in->vino();
201}
202
203void Client::_release_faked_ino(Inode *in)
204{
205 free_faked_inos.insert(in->faked_ino);
206 faked_ino_map.erase(in->faked_ino);
207}
208
209vinodeno_t Client::_map_faked_ino(ino_t ino)
210{
211 vinodeno_t vino;
212 if (ino == 1)
213 vino = root->vino();
214 else if (faked_ino_map.count(ino))
215 vino = faked_ino_map[ino];
216 else
217 vino = vinodeno_t(0, CEPH_NOSNAP);
218 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
219 return vino;
220}
221
222vinodeno_t Client::map_faked_ino(ino_t ino)
223{
224 Mutex::Locker lock(client_lock);
225 return _map_faked_ino(ino);
226}
227
228// cons/des
229
230Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
231 : Dispatcher(m->cct),
232 m_command_hook(this),
233 timer(m->cct, client_lock),
234 callback_handle(NULL),
235 switch_interrupt_cb(NULL),
236 remount_cb(NULL),
237 ino_invalidate_cb(NULL),
238 dentry_invalidate_cb(NULL),
7c673cae
FG
239 umask_cb(NULL),
240 can_invalidate_dentries(false),
7c673cae
FG
241 async_ino_invalidator(m->cct),
242 async_dentry_invalidator(m->cct),
243 interrupt_finisher(m->cct),
244 remount_finisher(m->cct),
245 objecter_finisher(m->cct),
246 tick_event(NULL),
247 messenger(m), monclient(mc),
248 objecter(objecter_),
249 whoami(mc->get_global_id()), cap_epoch_barrier(0),
250 last_tid(0), oldest_tid(0), last_flush_tid(1),
251 initialized(false),
31f18b77 252 mounted(false), unmounting(false), blacklisted(false),
b32b8144 253 local_osd(-ENXIO), local_osd_epoch(0),
7c673cae 254 unsafe_sync_write(0),
b32b8144
FG
255 client_lock("Client::client_lock"),
256 deleg_timeout(0)
7c673cae
FG
257{
258 _reset_faked_inos();
259 //
260 root = 0;
261
262 num_flushing_caps = 0;
263
264 _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
265 _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);
266
267 user_id = cct->_conf->client_mount_uid;
268 group_id = cct->_conf->client_mount_gid;
269
270 acl_type = NO_ACL;
271 if (cct->_conf->client_acl_type == "posix_acl")
272 acl_type = POSIX_ACL;
273
7c673cae
FG
274 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
275
276 // file handles
277 free_fd_set.insert(10, 1<<30);
278
279 mdsmap.reset(new MDSMap);
280
281 // osd interfaces
282 writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
283 &client_lock));
284 objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
285 client_flush_set_callback, // all commit callback
286 (void*)this,
287 cct->_conf->client_oc_size,
288 cct->_conf->client_oc_max_objects,
289 cct->_conf->client_oc_max_dirty,
290 cct->_conf->client_oc_target_dirty,
291 cct->_conf->client_oc_max_dirty_age,
292 true));
293 objecter_finisher.start();
294 filer.reset(new Filer(objecter, &objecter_finisher));
31f18b77 295 objecter->enable_blacklist_events();
7c673cae
FG
296}
297
298
299Client::~Client()
300{
301 assert(!client_lock.is_locked());
302
31f18b77
FG
303 // It is necessary to hold client_lock, because any inode destruction
304 // may call into ObjectCacher, which asserts that it's lock (which is
305 // client_lock) is held.
306 client_lock.Lock();
7c673cae 307 tear_down_cache();
31f18b77 308 client_lock.Unlock();
7c673cae
FG
309}
310
311void Client::tear_down_cache()
312{
313 // fd's
314 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
315 it != fd_map.end();
316 ++it) {
317 Fh *fh = it->second;
318 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
319 _release_fh(fh);
320 }
321 fd_map.clear();
322
323 while (!opened_dirs.empty()) {
324 dir_result_t *dirp = *opened_dirs.begin();
325 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
326 _closedir(dirp);
327 }
328
329 // caps!
330 // *** FIXME ***
331
332 // empty lru
7c673cae
FG
333 trim_cache();
334 assert(lru.lru_get_size() == 0);
335
336 // close root ino
337 assert(inode_map.size() <= 1 + root_parents.size());
338 if (root && inode_map.size() == 1 + root_parents.size()) {
339 delete root;
340 root = 0;
341 root_ancestor = 0;
342 while (!root_parents.empty())
343 root_parents.erase(root_parents.begin());
344 inode_map.clear();
345 _reset_faked_inos();
346 }
347
348 assert(inode_map.empty());
349}
350
351inodeno_t Client::get_root_ino()
352{
353 Mutex::Locker l(client_lock);
354 if (use_faked_inos())
355 return root->faked_ino;
356 else
357 return root->ino;
358}
359
360Inode *Client::get_root()
361{
362 Mutex::Locker l(client_lock);
363 root->ll_get();
364 return root;
365}
366
367
368// debug crapola
369
370void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
371{
372 filepath path;
373 in->make_long_path(path);
374 ldout(cct, 1) << "dump_inode: "
375 << (disconnected ? "DISCONNECTED ":"")
376 << "inode " << in->ino
377 << " " << path
378 << " ref " << in->get_num_ref()
379 << *in << dendl;
380
381 if (f) {
382 f->open_object_section("inode");
383 f->dump_stream("path") << path;
384 if (disconnected)
385 f->dump_int("disconnected", 1);
386 in->dump(f);
387 f->close_section();
388 }
389
390 did.insert(in);
391 if (in->dir) {
392 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
393 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
394 it != in->dir->dentries.end();
395 ++it) {
396 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
397 if (f) {
398 f->open_object_section("dentry");
399 it->second->dump(f);
400 f->close_section();
401 }
402 if (it->second->inode)
403 dump_inode(f, it->second->inode.get(), did, false);
404 }
405 }
406}
407
408void Client::dump_cache(Formatter *f)
409{
410 set<Inode*> did;
411
412 ldout(cct, 1) << "dump_cache" << dendl;
413
414 if (f)
415 f->open_array_section("cache");
416
417 if (root)
418 dump_inode(f, root, did, true);
419
420 // make a second pass to catch anything disconnected
421 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
422 it != inode_map.end();
423 ++it) {
424 if (did.count(it->second))
425 continue;
426 dump_inode(f, it->second, did, true);
427 }
428
429 if (f)
430 f->close_section();
431}
432
433void Client::dump_status(Formatter *f)
434{
435 assert(client_lock.is_locked_by_me());
436
437 ldout(cct, 1) << __func__ << dendl;
438
439 const epoch_t osd_epoch
440 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
441
442 if (f) {
443 f->open_object_section("metadata");
444 for (const auto& kv : metadata)
445 f->dump_string(kv.first.c_str(), kv.second);
446 f->close_section();
447
448 f->dump_int("dentry_count", lru.lru_get_size());
449 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
450 f->dump_int("id", get_nodeid().v);
1adf2230
AA
451 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
452 f->dump_object("inst", inst);
453 f->dump_stream("inst_str") << inst;
454 f->dump_stream("addr_str") << inst.addr;
7c673cae
FG
455 f->dump_int("inode_count", inode_map.size());
456 f->dump_int("mds_epoch", mdsmap->get_epoch());
457 f->dump_int("osd_epoch", osd_epoch);
458 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
459 }
460}
461
462int Client::init()
463{
464 timer.init();
465 objectcacher->start();
466
467 client_lock.Lock();
468 assert(!initialized);
469
470 messenger->add_dispatcher_tail(this);
471 client_lock.Unlock();
472
473 _finish_init();
474 return 0;
475}
476
477void Client::_finish_init()
478{
479 client_lock.Lock();
480 // logger
481 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
482 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
483 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
484 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
485 logger.reset(plb.create_perf_counters());
486 cct->get_perfcounters_collection()->add(logger.get());
487
488 client_lock.Unlock();
489
490 cct->_conf->add_observer(this);
491
492 AdminSocket* admin_socket = cct->get_admin_socket();
493 int ret = admin_socket->register_command("mds_requests",
494 "mds_requests",
495 &m_command_hook,
496 "show in-progress mds requests");
497 if (ret < 0) {
498 lderr(cct) << "error registering admin socket command: "
499 << cpp_strerror(-ret) << dendl;
500 }
501 ret = admin_socket->register_command("mds_sessions",
502 "mds_sessions",
503 &m_command_hook,
504 "show mds session state");
505 if (ret < 0) {
506 lderr(cct) << "error registering admin socket command: "
507 << cpp_strerror(-ret) << dendl;
508 }
509 ret = admin_socket->register_command("dump_cache",
510 "dump_cache",
511 &m_command_hook,
512 "show in-memory metadata cache contents");
513 if (ret < 0) {
514 lderr(cct) << "error registering admin socket command: "
515 << cpp_strerror(-ret) << dendl;
516 }
517 ret = admin_socket->register_command("kick_stale_sessions",
518 "kick_stale_sessions",
519 &m_command_hook,
520 "kick sessions that were remote reset");
521 if (ret < 0) {
522 lderr(cct) << "error registering admin socket command: "
523 << cpp_strerror(-ret) << dendl;
524 }
525 ret = admin_socket->register_command("status",
526 "status",
527 &m_command_hook,
528 "show overall client status");
529 if (ret < 0) {
530 lderr(cct) << "error registering admin socket command: "
531 << cpp_strerror(-ret) << dendl;
532 }
533
534 client_lock.Lock();
535 initialized = true;
536 client_lock.Unlock();
537}
538
539void Client::shutdown()
540{
541 ldout(cct, 1) << "shutdown" << dendl;
542
543 // If we were not mounted, but were being used for sending
544 // MDS commands, we may have sessions that need closing.
545 client_lock.Lock();
546 _close_sessions();
547 client_lock.Unlock();
548
549 cct->_conf->remove_observer(this);
550
551 AdminSocket* admin_socket = cct->get_admin_socket();
552 admin_socket->unregister_command("mds_requests");
553 admin_socket->unregister_command("mds_sessions");
554 admin_socket->unregister_command("dump_cache");
555 admin_socket->unregister_command("kick_stale_sessions");
556 admin_socket->unregister_command("status");
557
558 if (ino_invalidate_cb) {
559 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
560 async_ino_invalidator.wait_for_empty();
561 async_ino_invalidator.stop();
562 }
563
564 if (dentry_invalidate_cb) {
565 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
566 async_dentry_invalidator.wait_for_empty();
567 async_dentry_invalidator.stop();
568 }
569
570 if (switch_interrupt_cb) {
571 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
572 interrupt_finisher.wait_for_empty();
573 interrupt_finisher.stop();
574 }
575
576 if (remount_cb) {
577 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
578 remount_finisher.wait_for_empty();
579 remount_finisher.stop();
580 }
581
582 objectcacher->stop(); // outside of client_lock! this does a join.
583
584 client_lock.Lock();
585 assert(initialized);
586 initialized = false;
587 timer.shutdown();
588 client_lock.Unlock();
589
590 objecter_finisher.wait_for_empty();
591 objecter_finisher.stop();
592
593 if (logger) {
594 cct->get_perfcounters_collection()->remove(logger.get());
595 logger.reset();
596 }
597}
598
599
600// ===================
601// metadata cache stuff
602
603void Client::trim_cache(bool trim_kernel_dcache)
604{
181888fb
FG
605 uint64_t max = cct->_conf->client_cache_size;
606 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
7c673cae
FG
607 unsigned last = 0;
608 while (lru.lru_get_size() != last) {
609 last = lru.lru_get_size();
610
181888fb 611 if (!unmounting && lru.lru_get_size() <= max) break;
7c673cae
FG
612
613 // trim!
31f18b77 614 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
615 if (!dn)
616 break; // done
617
618 trim_dentry(dn);
619 }
620
181888fb 621 if (trim_kernel_dcache && lru.lru_get_size() > max)
7c673cae
FG
622 _invalidate_kernel_dcache();
623
624 // hose root?
625 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
626 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
627 delete root;
628 root = 0;
629 root_ancestor = 0;
630 while (!root_parents.empty())
631 root_parents.erase(root_parents.begin());
632 inode_map.clear();
633 _reset_faked_inos();
634 }
635}
636
637void Client::trim_cache_for_reconnect(MetaSession *s)
638{
639 mds_rank_t mds = s->mds_num;
640 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
641
642 int trimmed = 0;
643 list<Dentry*> skipped;
644 while (lru.lru_get_size() > 0) {
645 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
646 if (!dn)
647 break;
648
649 if ((dn->inode && dn->inode->caps.count(mds)) ||
650 dn->dir->parent_inode->caps.count(mds)) {
651 trim_dentry(dn);
652 trimmed++;
653 } else
654 skipped.push_back(dn);
655 }
656
657 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
658 lru.lru_insert_mid(*p);
659
660 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
661 << " trimmed " << trimmed << " dentries" << dendl;
662
663 if (s->caps.size() > 0)
664 _invalidate_kernel_dcache();
665}
666
667void Client::trim_dentry(Dentry *dn)
668{
669 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
670 << " in dir " << hex << dn->dir->parent_inode->ino
671 << dendl;
672 if (dn->inode) {
673 Inode *diri = dn->dir->parent_inode;
674 diri->dir_release_count++;
675 clear_dir_complete_and_ordered(diri, true);
676 }
677 unlink(dn, false, false); // drop dir, drop dentry
678}
679
680
1adf2230
AA
681void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
682 uint64_t truncate_seq, uint64_t truncate_size)
7c673cae 683{
7c673cae
FG
684 uint64_t prior_size = in->size;
685
7c673cae
FG
686 if (truncate_seq > in->truncate_seq ||
687 (truncate_seq == in->truncate_seq && size > in->size)) {
688 ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
689 in->size = size;
690 in->reported_size = size;
691 if (truncate_seq != in->truncate_seq) {
692 ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
693 << truncate_seq << dendl;
694 in->truncate_seq = truncate_seq;
695 in->oset.truncate_seq = truncate_seq;
696
697 // truncate cached file data
698 if (prior_size > size) {
699 _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
700 }
701 }
702
703 // truncate inline data
704 if (in->inline_version < CEPH_INLINE_NONE) {
705 uint32_t len = in->inline_data.length();
706 if (size < len)
707 in->inline_data.splice(size, len - size);
708 }
709 }
710 if (truncate_seq >= in->truncate_seq &&
711 in->truncate_size != truncate_size) {
712 if (in->is_file()) {
713 ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
714 << truncate_size << dendl;
715 in->truncate_size = truncate_size;
716 in->oset.truncate_size = truncate_size;
717 } else {
718 ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
719 }
720 }
1adf2230
AA
721}
722
723void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
724 utime_t ctime, utime_t mtime, utime_t atime)
725{
726 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
727 << " ctime " << ctime << " mtime " << mtime << dendl;
728
729 if (time_warp_seq > in->time_warp_seq)
730 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
731 << " is higher than local time_warp_seq "
732 << in->time_warp_seq << dendl;
733
734 int warn = false;
7c673cae
FG
735 // be careful with size, mtime, atime
736 if (issued & (CEPH_CAP_FILE_EXCL|
737 CEPH_CAP_FILE_WR|
738 CEPH_CAP_FILE_BUFFER|
739 CEPH_CAP_AUTH_EXCL|
740 CEPH_CAP_XATTR_EXCL)) {
741 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
742 if (ctime > in->ctime)
743 in->ctime = ctime;
744 if (time_warp_seq > in->time_warp_seq) {
7c673cae
FG
745 //the mds updated times, so take those!
746 in->mtime = mtime;
747 in->atime = atime;
748 in->time_warp_seq = time_warp_seq;
749 } else if (time_warp_seq == in->time_warp_seq) {
750 //take max times
751 if (mtime > in->mtime)
752 in->mtime = mtime;
753 if (atime > in->atime)
754 in->atime = atime;
755 } else if (issued & CEPH_CAP_FILE_EXCL) {
756 //ignore mds values as we have a higher seq
757 } else warn = true;
758 } else {
759 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
760 if (time_warp_seq >= in->time_warp_seq) {
761 in->ctime = ctime;
762 in->mtime = mtime;
763 in->atime = atime;
764 in->time_warp_seq = time_warp_seq;
765 } else warn = true;
766 }
767 if (warn) {
768 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
769 << time_warp_seq << " is lower than local time_warp_seq "
770 << in->time_warp_seq
771 << dendl;
772 }
773}
774
775void Client::_fragmap_remove_non_leaves(Inode *in)
776{
777 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
778 if (!in->dirfragtree.is_leaf(p->first))
779 in->fragmap.erase(p++);
780 else
781 ++p;
782}
783
784void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
785{
786 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
787 if (p->second == mds)
788 in->fragmap.erase(p++);
789 else
790 ++p;
791}
792
793Inode * Client::add_update_inode(InodeStat *st, utime_t from,
794 MetaSession *session,
795 const UserPerm& request_perms)
796{
797 Inode *in;
798 bool was_new = false;
799 if (inode_map.count(st->vino)) {
800 in = inode_map[st->vino];
801 ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
802 } else {
803 in = new Inode(this, st->vino, &st->layout);
804 inode_map[st->vino] = in;
805
806 if (use_faked_inos())
807 _assign_faked_ino(in);
808
809 if (!root) {
810 root = in;
811 root_ancestor = in;
812 cwd = root;
813 } else if (!mounted) {
814 root_parents[root_ancestor] = in;
815 root_ancestor = in;
816 }
817
818 // immutable bits
819 in->ino = st->vino.ino;
820 in->snapid = st->vino.snapid;
821 in->mode = st->mode & S_IFMT;
822 was_new = true;
823 }
824
825 in->rdev = st->rdev;
826 if (in->is_symlink())
827 in->symlink = st->symlink;
828
7c673cae 829 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1adf2230
AA
830 bool new_version = false;
831 if (in->version == 0 ||
832 ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
833 (in->version & ~1) < st->version))
834 new_version = true;
7c673cae 835
1adf2230
AA
836 int issued;
837 in->caps_issued(&issued);
838 issued |= in->caps_dirty();
839 int new_issued = ~issued & (int)st->cap.caps;
7c673cae 840
1adf2230
AA
841 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
842 !(issued & CEPH_CAP_AUTH_EXCL)) {
843 in->mode = st->mode;
844 in->uid = st->uid;
845 in->gid = st->gid;
846 in->btime = st->btime;
847 }
7c673cae 848
1adf2230
AA
849 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
850 !(issued & CEPH_CAP_LINK_EXCL)) {
851 in->nlink = st->nlink;
852 }
7c673cae 853
1adf2230
AA
854 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
855 update_inode_file_time(in, issued, st->time_warp_seq,
856 st->ctime, st->mtime, st->atime);
857 }
7c673cae 858
1adf2230
AA
859 if (new_version ||
860 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7c673cae 861 in->layout = st->layout;
1adf2230
AA
862 update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
863 }
7c673cae 864
1adf2230
AA
865 if (in->is_dir()) {
866 if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
867 in->dirstat = st->dirstat;
868 }
869 // dir_layout/rstat/quota are not tracked by capability, update them only if
870 // the inode stat is from auth mds
871 if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
7c673cae
FG
872 in->dir_layout = st->dir_layout;
873 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
1adf2230
AA
874 in->rstat = st->rstat;
875 in->quota = st->quota;
876 }
877 // move me if/when version reflects fragtree changes.
878 if (in->dirfragtree != st->dirfragtree) {
879 in->dirfragtree = st->dirfragtree;
880 _fragmap_remove_non_leaves(in);
7c673cae 881 }
7c673cae
FG
882 }
883
884 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
885 st->xattrbl.length() &&
886 st->xattr_version > in->xattr_version) {
887 bufferlist::iterator p = st->xattrbl.begin();
888 ::decode(in->xattrs, p);
889 in->xattr_version = st->xattr_version;
890 }
891
1adf2230
AA
892 if (st->inline_version > in->inline_version) {
893 in->inline_data = st->inline_data;
894 in->inline_version = st->inline_version;
7c673cae
FG
895 }
896
1adf2230
AA
897 /* always take a newer change attr */
898 if (st->change_attr > in->change_attr)
899 in->change_attr = st->change_attr;
900
901 if (st->version > in->version)
902 in->version = st->version;
903
904 if (was_new)
905 ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
906
907 if (!st->cap.caps)
908 return in; // as with readdir returning indoes in different snaprealms (no caps!)
909
7c673cae
FG
910 if (in->snapid == CEPH_NOSNAP) {
911 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
912 st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
913 request_perms);
28e407b8 914 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 915 in->max_size = st->max_size;
28e407b8
AA
916 in->rstat = st->rstat;
917 }
7c673cae 918
1adf2230
AA
919 // setting I_COMPLETE needs to happen after adding the cap
920 if (in->is_dir() &&
921 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
922 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
923 in->dirstat.nfiles == 0 &&
924 in->dirstat.nsubdirs == 0) {
925 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
926 in->flags |= I_COMPLETE | I_DIR_ORDERED;
927 if (in->dir) {
928 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
929 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
930 in->dir->readdir_cache.clear();
931 for (const auto& p : in->dir->dentries) {
932 unlink(p.second, true, true); // keep dir, keep dentry
933 }
934 if (in->dir->dentries.empty())
935 close_dir(in->dir);
7c673cae 936 }
7c673cae 937 }
1adf2230
AA
938 } else {
939 in->snap_caps |= st->cap.caps;
7c673cae
FG
940 }
941
942 return in;
943}
944
945
946/*
947 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
948 */
949Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
950 Inode *in, utime_t from, MetaSession *session,
951 Dentry *old_dentry)
952{
953 Dentry *dn = NULL;
954 if (dir->dentries.count(dname))
955 dn = dir->dentries[dname];
956
957 ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
958 << " in dir " << dir->parent_inode->vino() << " dn " << dn
959 << dendl;
960
961 if (dn && dn->inode) {
962 if (dn->inode->vino() == in->vino()) {
963 touch_dn(dn);
964 ldout(cct, 12) << " had dentry " << dname
965 << " with correct vino " << dn->inode->vino()
966 << dendl;
967 } else {
968 ldout(cct, 12) << " had dentry " << dname
969 << " with WRONG vino " << dn->inode->vino()
970 << dendl;
971 unlink(dn, true, true); // keep dir, keep dentry
972 }
973 }
974
975 if (!dn || !dn->inode) {
976 InodeRef tmp_ref(in);
977 if (old_dentry) {
978 if (old_dentry->dir != dir) {
979 Inode *old_diri = old_dentry->dir->parent_inode;
980 old_diri->dir_ordered_count++;
981 clear_dir_complete_and_ordered(old_diri, false);
982 }
983 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
984 }
985 Inode *diri = dir->parent_inode;
986 diri->dir_ordered_count++;
987 clear_dir_complete_and_ordered(diri, false);
988 dn = link(dir, dname, in, dn);
989 }
990
991 update_dentry_lease(dn, dlease, from, session);
992 return dn;
993}
994
995void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
996{
997 utime_t dttl = from;
998 dttl += (float)dlease->duration_ms / 1000.0;
999
1000 assert(dn);
1001
1002 if (dlease->mask & CEPH_LOCK_DN) {
1003 if (dttl > dn->lease_ttl) {
1004 ldout(cct, 10) << "got dentry lease on " << dn->name
1005 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1006 dn->lease_ttl = dttl;
1007 dn->lease_mds = session->mds_num;
1008 dn->lease_seq = dlease->seq;
1009 dn->lease_gen = session->cap_gen;
1010 }
1011 }
1012 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1013}
1014
1015
1016/*
1017 * update MDS location cache for a single inode
1018 */
1019void Client::update_dir_dist(Inode *in, DirStat *dst)
1020{
1021 // auth
1022 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1023 if (dst->auth >= 0) {
1024 in->fragmap[dst->frag] = dst->auth;
1025 } else {
1026 in->fragmap.erase(dst->frag);
1027 }
1028 if (!in->dirfragtree.is_leaf(dst->frag)) {
1029 in->dirfragtree.force_to_leaf(cct, dst->frag);
1030 _fragmap_remove_non_leaves(in);
1031 }
1032
1033 // replicated
1034 in->dir_replicated = !dst->dist.empty(); // FIXME that's just one frag!
1035
1036 // dist
1037 /*
1038 if (!st->dirfrag_dist.empty()) { // FIXME
1039 set<int> dist = st->dirfrag_dist.begin()->second;
1040 if (dist.empty() && !in->dir_contacts.empty())
1041 ldout(cct, 9) << "lost dist spec for " << in->ino
1042 << " " << dist << dendl;
1043 if (!dist.empty() && in->dir_contacts.empty())
1044 ldout(cct, 9) << "got dist spec for " << in->ino
1045 << " " << dist << dendl;
1046 in->dir_contacts = dist;
1047 }
1048 */
1049}
1050
1051void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1052{
1053 if (diri->flags & I_COMPLETE) {
1054 if (complete) {
1055 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1056 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1057 } else {
1058 if (diri->flags & I_DIR_ORDERED) {
1059 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1060 diri->flags &= ~I_DIR_ORDERED;
1061 }
1062 }
1063 if (diri->dir)
1064 diri->dir->readdir_cache.clear();
1065 }
1066}
1067
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the dentry/inode entries carried in the reply's extra
 * bufferlist, links them into the in-memory Dir, assigns readdir
 * offsets, and (when the cached listing is still valid) populates the
 * shared readdir cache as well as the caller's dir_result_t buffer.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  MClientReply *reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();

  dir_result_t *dirp = request->dirp;
  assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  bufferlist::iterator p = reply->get_extra_bl().begin();
  if (!p.end()) {
    // snapdir?  lssnap results live under the virtual .snap dir
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    assert(dir);

    // dirstat
    DirStat dst(p);
    __u32 numdn;
    __u16 flags;
    ::decode(numdn, p);
    ::decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    // where this chunk starts within the frag
    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the mds may answer with a different frag (split/merge happened)
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // starting at the very beginning of the directory: snapshot the
    // release/ordered counters so we can later tell whether the cached
    // listing stayed complete while we filled it
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      ::decode(dname, p);
      ::decode(dlease, p);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	// in hash order, the offset restarts at 2 whenever the dentry
	// name hash changes
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache (only while the listing has stayed valid)
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  assert(0 == "unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1223
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * Decodes the (optional) dentry and target-inode records from the
 * reply's trace bufferlist and applies them to the metadata cache.
 * Returns the target inode, or NULL for traceless/unsafe replies.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already carried (and applied) the trace
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: all we can do is invalidate cached state that
    // the mutation may have made stale
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: the mds promised xattrs but sent none
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	  assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // negative dentry: drop any cached inode link, keep (or create)
      // a null dentry if we were granted a lease on it
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1393
1394// -------
1395
/*
 * Pick the mds rank to send a request to.
 *
 * Preference order: an explicitly requested resend target, the auth
 * mds for the hashed dentry name within the relevant dirfrag, the mds
 * we hold caps from on the request's inode, and finally a random up
 * mds. When the choice came from the dirfrag/fragmap, *phash_diri is
 * set to the directory inode consulted so the caller can prune its
 * fragmap if the target turns out to be stopped.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  // explicit resend target wins
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes have no mds of their own; walk up to the nearest
    // non-snap ancestor
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dn_set.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      is_hash = false;
    }

    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    // dirfrag hash -> auth mds, if we have a fragmap entry for it
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
	ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // otherwise follow our caps: the auth cap if the op needs the auth
    // mds, else any cap we hold
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1495
1496
1497void Client::connect_mds_targets(mds_rank_t mds)
1498{
1499 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1500 assert(mds_sessions.count(mds));
1501 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1502 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1503 q != info.export_targets.end();
1504 ++q) {
1505 if (mds_sessions.count(*q) == 0 &&
1506 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1507 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1508 << " export target mds." << *q << dendl;
1509 _open_mds_session(*q);
1510 }
1511 }
1512}
1513
1514void Client::dump_mds_sessions(Formatter *f)
1515{
1516 f->dump_int("id", get_nodeid().v);
1adf2230
AA
1517 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
1518 f->dump_object("inst", inst);
1519 f->dump_stream("inst_str") << inst;
1520 f->dump_stream("addr_str") << inst.addr;
7c673cae
FG
1521 f->open_array_section("sessions");
1522 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1523 f->open_object_section("session");
1524 p->second->dump(f);
1525 f->close_section();
1526 }
1527 f->close_section();
1528 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1529}
1530void Client::dump_mds_requests(Formatter *f)
1531{
1532 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1533 p != mds_requests.end();
1534 ++p) {
1535 f->open_object_section("request");
1536 p->second->dump(f);
1537 f->close_section();
1538 }
1539}
1540
/*
 * Resolve the target inode of a completed request for the caller.
 *
 * Sets *pcreated to whether this request actually created the inode
 * (the mds appends the created ino to the extra bufferlist only for
 * the winner of a create race) and fills *ptarget. For traceless
 * replies the target is recovered via lookup/getattr; returns the
 * (possibly updated) result code.
 */
int Client::verify_reply_trace(int r,
			       MetaRequest *request, MClientReply *reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // insert_trace already resolved the target for us
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  assert(0 == "how did this happen? i want logs!");
	}
      } else {
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  // someone else replaced our freshly-created inode already
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1613
1614
1615/**
1616 * make a request
1617 *
1618 * Blocking helper to make an MDS request.
1619 *
1620 * If the ptarget flag is set, behavior changes slightly: the caller
1621 * expects to get a pointer to the inode we are creating or operating
1622 * on. As a result, we will follow up any traceless mutation reply
1623 * with a getattr or lookup to transparently handle a traceless reply
1624 * from the MDS (as when the MDS restarts and the client has to replay
1625 * a request).
1626 *
1627 * @param request the MetaRequest to execute
1628 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1629 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1630 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1631 * @param use_mds [optional] prefer a specific mds (-1 for default)
1632 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1633 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK may block indefinitely, so it is excluded from the
  // oldest-tid bookkeeping reported to the mds
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // (re)send the request until we get a reply, are forwarded, kicked,
  // or aborted
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the chosen rank no longer exists; forget stale hints and retry
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
	   request->resend_mds < 0 && // forward
	   !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted before any reply arrived; hand the abort code back
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);  // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
1781
1782void Client::unregister_request(MetaRequest *req)
1783{
1784 mds_requests.erase(req->tid);
1785 if (req->tid == oldest_tid) {
1786 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1787 while (true) {
1788 if (p == mds_requests.end()) {
1789 oldest_tid = 0;
1790 break;
1791 }
1792 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1793 oldest_tid = p->first;
1794 break;
1795 }
1796 ++p;
1797 }
1798 }
1799 put_request(req);
1800}
1801
1802void Client::put_request(MetaRequest *request)
1803{
1804 if (request->_put()) {
1805 int op = -1;
1806 if (request->success)
1807 op = request->get_op();
1808 InodeRef other_in;
1809 request->take_other_inode(&other_in);
1810 delete request;
1811
1812 if (other_in &&
1813 (op == CEPH_MDS_OP_RMDIR ||
1814 op == CEPH_MDS_OP_RENAME ||
1815 op == CEPH_MDS_OP_RMSNAP)) {
1816 _try_to_trim_inode(other_in.get(), false);
1817 }
1818 }
1819}
1820
1821int Client::encode_inode_release(Inode *in, MetaRequest *req,
1822 mds_rank_t mds, int drop,
1823 int unless, int force)
1824{
1825 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1826 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1827 << ", have:" << ", force:" << force << ")" << dendl;
1828 int released = 0;
1829 if (in->caps.count(mds)) {
1830 Cap *caps = in->caps[mds];
1831 drop &= ~(in->dirty_caps | get_caps_used(in));
1832 if ((drop & caps->issued) &&
1833 !(unless & caps->issued)) {
1834 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1835 caps->issued &= ~drop;
1836 caps->implemented &= ~drop;
1837 released = 1;
1838 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1839 } else {
1840 released = force;
1841 }
1842 if (released) {
1843 ceph_mds_request_release rel;
1844 rel.ino = in->ino;
1845 rel.cap_id = caps->cap_id;
1846 rel.seq = caps->seq;
1847 rel.issue_seq = caps->issue_seq;
1848 rel.mseq = caps->mseq;
1849 rel.caps = caps->implemented;
1850 rel.wanted = caps->wanted;
1851 rel.dname_len = 0;
1852 rel.dname_seq = 0;
1853 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1854 }
1855 }
1856 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1857 << released << dendl;
1858 return released;
1859}
1860
1861void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1862 mds_rank_t mds, int drop, int unless)
1863{
1864 ldout(cct, 20) << "encode_dentry_release enter(dn:"
1865 << dn << ")" << dendl;
1866 int released = 0;
1867 if (dn->dir)
1868 released = encode_inode_release(dn->dir->parent_inode, req,
1869 mds, drop, unless, 1);
1870 if (released && dn->lease_mds == mds) {
1871 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1872 MClientRequest::Release& rel = req->cap_releases.back();
1873 rel.item.dname_len = dn->name.length();
1874 rel.item.dname_seq = dn->lease_seq;
1875 rel.dname = dn->name;
1876 }
1877 ldout(cct, 25) << "encode_dentry_release exit(dn:"
1878 << dn << ")" << dendl;
1879}
1880
1881
1882/*
1883 * This requires the MClientRequest *request member to be set.
1884 * It will error out horribly without one.
1885 * Additionally, if you set any *drop member, you'd better have
1886 * set the corresponding dentry!
1887 */
1888void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1889{
1890 ldout(cct, 20) << "encode_cap_releases enter (req: "
1891 << req << ", mds: " << mds << ")" << dendl;
1892 if (req->inode_drop && req->inode())
1893 encode_inode_release(req->inode(), req,
1894 mds, req->inode_drop,
1895 req->inode_unless);
1896
1897 if (req->old_inode_drop && req->old_inode())
1898 encode_inode_release(req->old_inode(), req,
1899 mds, req->old_inode_drop,
1900 req->old_inode_unless);
1901 if (req->other_inode_drop && req->other_inode())
1902 encode_inode_release(req->other_inode(), req,
1903 mds, req->other_inode_drop,
1904 req->other_inode_unless);
1905
1906 if (req->dentry_drop && req->dentry())
1907 encode_dentry_release(req->dentry(), req,
1908 mds, req->dentry_drop,
1909 req->dentry_unless);
1910
1911 if (req->old_dentry_drop && req->old_dentry())
1912 encode_dentry_release(req->old_dentry(), req,
1913 mds, req->old_dentry_drop,
1914 req->old_dentry_unless);
1915 ldout(cct, 25) << "encode_cap_releases exit (req: "
1916 << req << ", mds " << mds <<dendl;
1917}
1918
1919bool Client::have_open_session(mds_rank_t mds)
1920{
1921 return
1922 mds_sessions.count(mds) &&
1923 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1924 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1925}
1926
1927MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1928{
1929 if (mds_sessions.count(mds) == 0)
1930 return NULL;
1931 MetaSession *s = mds_sessions[mds];
1932 if (s->con != con)
1933 return NULL;
1934 return s;
1935}
1936
1937MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1938{
1939 if (mds_sessions.count(mds))
1940 return mds_sessions[mds];
1941 return _open_mds_session(mds);
1942}
1943
1944/**
1945 * Populate a map of strings with client-identifying metadata,
1946 * such as the hostname. Call this once at initialization.
1947 */
1948void Client::populate_metadata(const std::string &mount_root)
1949{
1950 // Hostname
1951 struct utsname u;
1952 int r = uname(&u);
1953 if (r >= 0) {
1954 metadata["hostname"] = u.nodename;
1955 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1956 } else {
1957 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1958 }
1959
1960 metadata["pid"] = stringify(getpid());
1961
1962 // Ceph entity id (the '0' in "client.0")
1963 metadata["entity_id"] = cct->_conf->name.get_id();
1964
1965 // Our mount position
1966 if (!mount_root.empty()) {
1967 metadata["root"] = mount_root;
1968 }
1969
1970 // Ceph version
1971 metadata["ceph_version"] = pretty_version_to_str();
1972 metadata["ceph_sha1"] = git_version_to_str();
1973
1974 // Apply any metadata from the user's configured overrides
1975 std::vector<std::string> tokens;
1976 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1977 for (const auto &i : tokens) {
1978 auto eqpos = i.find("=");
1979 // Throw out anything that isn't of the form "<str>=<str>"
1980 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1981 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1982 continue;
1983 }
1984 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1985 }
1986}
1987
1988/**
1989 * Optionally add or override client metadata fields.
1990 */
1991void Client::update_metadata(std::string const &k, std::string const &v)
1992{
1993 Mutex::Locker l(client_lock);
1994 assert(initialized);
1995
1996 if (metadata.count(k)) {
1997 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
1998 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
1999 }
2000
2001 metadata[k] = v;
2002}
2003
2004MetaSession *Client::_open_mds_session(mds_rank_t mds)
2005{
2006 ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
2007 assert(mds_sessions.count(mds) == 0);
2008 MetaSession *session = new MetaSession;
2009 session->mds_num = mds;
2010 session->seq = 0;
2011 session->inst = mdsmap->get_inst(mds);
2012 session->con = messenger->get_connection(session->inst);
2013 session->state = MetaSession::STATE_OPENING;
2014 session->mds_state = MDSMap::STATE_NULL;
2015 mds_sessions[mds] = session;
2016
2017 // Maybe skip sending a request to open if this MDS daemon
2018 // has previously sent us a REJECT.
2019 if (rejected_by_mds.count(mds)) {
2020 if (rejected_by_mds[mds] == session->inst) {
2021 ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
2022 "because we were rejected" << dendl;
2023 return session;
2024 } else {
2025 ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
2026 "rejected us, trying with new inst" << dendl;
2027 rejected_by_mds.erase(mds);
2028 }
2029 }
2030
2031 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
2032 m->client_meta = metadata;
2033 session->con->send_message(m);
2034 return session;
2035}
2036
// Politely ask the mds to close this session; it stays in CLOSING
// state until the mds replies (see handle_client_session).
void Client::_close_mds_session(MetaSession *s)
{
  ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSING;
  s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
}
2043
// Tear down a session once it is closed (by us or by the mds): wake
// anyone waiting on it, drop all caps held through it, requeue its
// in-flight requests, then unregister and delete the session record.
void Client::_closed_mds_session(MetaSession *s)
{
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
  delete s;
}
2055
// Dispatch an MClientSession message from an mds. Messages arriving on
// a connection that does not match our session record are dropped.
void Client::handle_client_session(MClientSession *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only extend the cap ttl if this ack matches our latest renew
    if (session->cap_renew_seq == m->get_seq()) {
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      wake_inode_waiters(session);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    // remember the rejecting instance so we don't endlessly retry it
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);

    break;

  default:
    ceph_abort();
  }

  m->put();
}
2123
2124bool Client::_any_stale_sessions() const
2125{
2126 assert(client_lock.is_locked_by_me());
2127
2128 for (const auto &i : mds_sessions) {
2129 if (i.second->state == MetaSession::STATE_STALE) {
2130 return true;
2131 }
2132 }
2133
2134 return false;
2135}
2136
2137void Client::_kick_stale_sessions()
2138{
2139 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2140
2141 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2142 p != mds_sessions.end(); ) {
2143 MetaSession *s = p->second;
2144 ++p;
2145 if (s->state == MetaSession::STATE_STALE)
2146 _closed_mds_session(s);
2147 }
2148}
2149
// (Re)build the wire message for `request` and send it to `session`'s
// MDS.  Used for both first sends and resends/replays after an MDS
// restart; `drop_cap_releases` is set when resending before the cap
// reconnect has gone out, in which case cap releases are discarded.
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // this op was already applied by the MDS (we got an unsafe reply);
    // mark it as a replay and include the target ino, if known
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't sent cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  // request->mds == -1 means this is the first send; record the
  // original send time (not updated on resends)
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap mseq at send time; handle_client_reply compares
  // it when deciding whether an ESTALE retry could succeed
  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2194
2195MClientRequest* Client::build_client_request(MetaRequest *request)
2196{
2197 MClientRequest *req = new MClientRequest(request->get_op());
2198 req->set_tid(request->tid);
2199 req->set_stamp(request->op_stamp);
2200 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2201
2202 // if the filepath's haven't been set, set them!
2203 if (request->path.empty()) {
2204 Inode *in = request->inode();
2205 Dentry *de = request->dentry();
2206 if (in)
2207 in->make_nosnap_relative_path(request->path);
2208 else if (de) {
2209 if (de->inode)
2210 de->inode->make_nosnap_relative_path(request->path);
2211 else if (de->dir) {
2212 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2213 request->path.push_dentry(de->name);
2214 }
2215 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2216 << " No path, inode, or appropriately-endowed dentry given!"
2217 << dendl;
2218 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2219 << " No path, inode, or dentry given!"
2220 << dendl;
2221 }
2222 req->set_filepath(request->get_filepath());
2223 req->set_filepath2(request->get_filepath2());
2224 req->set_data(request->data);
2225 req->set_retry_attempt(request->retry_attempt++);
2226 req->head.num_fwd = request->num_fwd;
2227 const gid_t *_gids;
2228 int gid_count = request->perms.get_gids(&_gids);
2229 req->set_gid_list(gid_count, _gids);
2230 return req;
2231}
2232
2233
2234
2235void Client::handle_client_request_forward(MClientRequestForward *fwd)
2236{
2237 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2238 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2239 if (!session) {
2240 fwd->put();
2241 return;
2242 }
2243 ceph_tid_t tid = fwd->get_tid();
2244
2245 if (mds_requests.count(tid) == 0) {
2246 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2247 fwd->put();
2248 return;
2249 }
2250
2251 MetaRequest *request = mds_requests[tid];
2252 assert(request);
2253
2254 // reset retry counter
2255 request->retry_attempt = 0;
2256
2257 // request not forwarded, or dest mds has no session.
2258 // resend.
2259 ldout(cct, 10) << "handle_client_request tid " << tid
2260 << " fwd " << fwd->get_num_fwd()
2261 << " to mds." << fwd->get_dest_mds()
2262 << ", resending to " << fwd->get_dest_mds()
2263 << dendl;
2264
2265 request->mds = -1;
2266 request->item.remove_myself();
2267 request->num_fwd = fwd->get_num_fwd();
2268 request->resend_mds = fwd->get_dest_mds();
2269 request->caller_cond->Signal();
2270
2271 fwd->put();
2272}
2273
2274bool Client::is_dir_operation(MetaRequest *req)
2275{
2276 int op = req->get_op();
2277 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2278 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2279 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2280 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2281 return true;
2282 return false;
2283}
2284
// Handle an MDS reply for a pending MetaRequest.  A request may first
// receive an "unsafe" reply (applied, not yet journaled) and later a
// "safe" one; the caller thread is woken only once, on the first reply
// it sees.  Consumes the message unless it is attached to the request.
void Client::handle_client_reply(MClientReply *reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  // a second unsafe reply after we already got one is a duplicate
  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // only retry if we'd be sending somewhere new, or the cap mseq has
    // changed since we sent (i.e. the MDS may now know better)
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      request->caller_cond->Signal();
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }

  // hand the reply to the request; insert_trace applies the metadata
  assert(request->reply == NULL);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wait for the caller to acknowledge (it clears dispatch_cond);
    // this hands the client_lock back and forth until kicked
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
2387
2388void Client::_handle_full_flag(int64_t pool)
2389{
2390 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2391 << "on " << pool << dendl;
2392 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2393 // to do this rather than blocking, because otherwise when we fill up we
2394 // potentially lock caps forever on files with dirty pages, and we need
2395 // to be able to release those caps to the MDS so that it can delete files
2396 // and free up space.
2397 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2398
2399 // For all inodes with layouts in this pool and a pending flush write op
2400 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2401 // from ObjectCacher so that it doesn't re-issue the write in response to
2402 // the ENOSPC error.
2403 // Fortunately since we're cancelling everything in a given pool, we don't
2404 // need to know which ops belong to which ObjectSet, we can just blow all
2405 // the un-flushed cached data away and mark any dirty inodes' async_err
2406 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2407 // affecting this pool, and all the objectsets we're purging were also
2408 // in this pool.
2409 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2410 i != inode_map.end(); ++i)
2411 {
2412 Inode *inode = i->second;
2413 if (inode->oset.dirty_or_tx
2414 && (pool == -1 || inode->layout.pool_id == pool)) {
2415 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2416 << " has dirty objects, purging and setting ENOSPC" << dendl;
2417 objectcacher->purge_set(&inode->oset);
2418 inode->set_async_err(-ENOSPC);
2419 }
2420 }
2421
2422 if (cancelled_epoch != (epoch_t)-1) {
2423 set_cap_epoch_barrier(cancelled_epoch);
2424 }
2425}
2426
// Process an OSD map update: detect whether this client has been
// blacklisted (and if so abort requests and close sessions), and
// handle cluster-wide or per-pool FULL flags.  Consumes the message.
void Client::handle_osd_map(MOSDMap *m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddr = messenger->get_myaddr();
  if (!blacklisted && new_blacklists.count(myaddr)) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
        });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;
    // Abort all pending MDS requests and wake their callers; note that
    // we advance the iterator before touching the request.
    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
         p != mds_requests.end(); ) {
      auto req = p->second;
      ++p;
      req->abort(-EBLACKLISTED);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
    }

    // Progress aborts on any requests that were on this waitlist.  Any
    // requests that were on a waiting_for_open session waitlist
    // will get kicked during close session below.
    signal_cond_list(waiting_for_mdsmap);

    // Force-close all sessions: assume this is not abandoning any state
    // on the MDS side because the MDS will have seen the blacklist too.
    while(!mds_sessions.empty()) {
      auto i = mds_sessions.begin();
      auto session = i->second;
      _closed_mds_session(session);
    }

    // Since we know all our OSD ops will fail, cancel them all preemptively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
	return o.is_blacklisted(myaddr);});
  }

  if (objecter->osdmap_full_flag()) {
    // whole-cluster full flag: treat every pool as full
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }

  m->put();
}
2504
2505
2506// ------------------------
2507// incoming messages
2508
2509
// Messenger dispatch entry point.  Takes client_lock, routes each
// message type to its handler, and returns false for types this
// dispatcher does not own (so the messenger can try other dispatchers).
bool Client::ms_dispatch(Message *m)
{
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
  // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

  // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    // only MDS command replies are ours; others belong elsewhere
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?  opportunistically trim the cache on every message and
  // poke unmount() whenever the trim made progress
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2587
2588void Client::handle_fs_map(MFSMap *m)
2589{
2590 fsmap.reset(new FSMap(m->get_fsmap()));
2591 m->put();
2592
2593 signal_cond_list(waiting_for_fsmap);
2594
2595 monclient->sub_got("fsmap", fsmap->get_epoch());
2596}
2597
2598void Client::handle_fs_map_user(MFSMapUser *m)
2599{
2600 fsmap_user.reset(new FSMapUser);
2601 *fsmap_user = m->get_fsmap();
2602 m->put();
2603
2604 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2605 signal_cond_list(waiting_for_fsmap);
2606}
2607
// Process a new MDSMap: cancel commands to vanished/laggy MDS GIDs and
// walk every session, reacting to per-rank state changes (failover,
// reconnect, activation, removal).  Consumes the message.
void Client::handle_mds_map(MMDSMap* m)
{
  // ignore maps we already have (or older)
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    m->put();
    return;
  }

  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;

  // keep the old map around so we can compare per-rank states
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // erase after the scan so we don't invalidate the iterator above
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = p->second;
    ++p;  // advance first: _closed_mds_session() may erase this entry

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_inst(mds) != session->inst) {
      // the rank is now served by a different daemon instance
      session->con->mark_down();
      session->inst = mdsmap->get_inst(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->get_connection(session->inst);
      send_reconnect(session);
    } else if (newstate >= MDSMap::STATE_ACTIVE) {
      if (oldstate < MDSMap::STATE_ACTIVE) {
	// kick new requests
	kick_requests(session);
	kick_flushing_caps(session);
	signal_context_list(session->waiting_for_open);
	kick_maxsize_requests(session);
	wake_inode_waiters(session);
      }
      connect_mds_targets(mds);
    } else if (newstate == MDSMap::STATE_NULL &&
	       mds >= mdsmap->get_max_mds()) {
      // rank no longer exists in the map at all
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  m->put();

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2699
// Send a cap reconnect to an MDS that entered the RECONNECT state:
// re-declare every cap we hold from that rank (with reset sequence
// numbers), re-describe snaprealms, and resend unsafe requests so the
// MDS can replay them.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // drop any queued cap-release message; those releases are moot now
  if (session->release) {
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(in->caps[mds]->issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      // reset all per-cap sequence state; the reconnect re-establishes it
      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // describe each snaprealm only once
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
2775
2776
// Resend requests targeted at this session's MDS that have not yet
// been attempted (retry_attempt == 0) and have no unsafe reply.
// Aborted requests just get their callers woken.
void Client::kick_requests(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    // requests with an unsafe reply are replayed via
    // resend_unsafe_requests(), not here
    if (req->got_unsafe)
      continue;
    if (req->aborted()) {
      // just wake the caller so it can observe the abort
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      continue;
    }
    if (req->retry_attempt > 0)
      continue; // new requests only
    if (req->mds == session->mds_num) {
      send_request(p->second, session);
    }
  }
}
2800
2801void Client::resend_unsafe_requests(MetaSession *session)
2802{
2803 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2804 !iter.end();
2805 ++iter)
2806 send_request(*iter, session);
2807
2808 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2809 // process completed requests in clientreplay stage.
2810 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2811 p != mds_requests.end();
2812 ++p) {
2813 MetaRequest *req = p->second;
2814 if (req->got_unsafe)
2815 continue;
31f18b77
FG
2816 if (req->aborted())
2817 continue;
7c673cae
FG
2818 if (req->retry_attempt == 0)
2819 continue; // old requests only
2820 if (req->mds == session->mds_num)
2821 send_request(req, session, true);
2822 }
2823}
2824
2825void Client::wait_unsafe_requests()
2826{
2827 list<MetaRequest*> last_unsafe_reqs;
2828 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2829 p != mds_sessions.end();
2830 ++p) {
2831 MetaSession *s = p->second;
2832 if (!s->unsafe_requests.empty()) {
2833 MetaRequest *req = s->unsafe_requests.back();
2834 req->get();
2835 last_unsafe_reqs.push_back(req);
2836 }
2837 }
2838
2839 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2840 p != last_unsafe_reqs.end();
2841 ++p) {
2842 MetaRequest *req = *p;
2843 if (req->unsafe_item.is_on_list())
2844 wait_on_list(req->waitfor_safe);
2845 put_request(req);
2846 }
2847}
2848
// A session is being torn down: wake the caller of every request that
// was targeted at its MDS, detach the requests from the session, and
// drop unsafe requests entirely (they will never get a safe reply).
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;  // advance first: unregister_request() erases from mds_requests
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  // every request bound to this session must be gone now
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
2875
2876
2877
2878
2879/************
2880 * leases
2881 */
2882
2883void Client::got_mds_push(MetaSession *s)
2884{
2885 s->seq++;
2886 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2887 if (s->state == MetaSession::STATE_CLOSING) {
2888 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2889 }
2890}
2891
// Handle a dentry lease revocation from the MDS: forget the lease
// locally (if we still have the dentry) and always send a
// LEASE_RELEASE ack back.  Consumes the message.
void Client::handle_lease(MClientLease *m)
{
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  // REVOKE is the only action the MDS sends us
  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    // we don't have it cached; nothing to invalidate, just ack
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    // drop the lease: lease_mds == -1 means "no lease held"
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;
  }

 revoke:
  // always acknowledge the revocation, even if we had nothing to drop
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2934
2935void Client::put_inode(Inode *in, int n)
2936{
2937 ldout(cct, 10) << "put_inode on " << *in << dendl;
2938 int left = in->_put(n);
2939 if (left == 0) {
2940 // release any caps
2941 remove_all_caps(in);
2942
2943 ldout(cct, 10) << "put_inode deleting " << *in << dendl;
2944 bool unclean = objectcacher->release_set(&in->oset);
2945 assert(!unclean);
2946 inode_map.erase(in->vino());
2947 if (use_faked_inos())
2948 _release_faked_ino(in);
2949
2950 if (in == root) {
2951 root = 0;
2952 root_ancestor = 0;
2953 while (!root_parents.empty())
2954 root_parents.erase(root_parents.begin());
2955 }
2956
2957 delete in;
2958 }
2959}
2960
2961void Client::close_dir(Dir *dir)
2962{
2963 Inode *in = dir->parent_inode;
2964 ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
2965 assert(dir->is_empty());
2966 assert(in->dir == dir);
2967 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
2968 if (!in->dn_set.empty())
2969 in->get_first_parent()->put(); // unpin dentry
2970
2971 delete in->dir;
2972 in->dir = 0;
2973 put_inode(in); // unpin inode
2974}
2975
/**
 * Link an inode into a directory under the given name, creating the
 * Dentry if one is not supplied.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;

    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn); // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    // directories holding a Dir and/or ll refs pin their dentry
    if (in->is_dir()) {
      if (in->dir)
	dn->get(); // dir -> dn pin
      if (in->ll_ref)
	dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      // a directory moved: detach it from its previous parent dentry
      // and invalidate that parent's readdir cache
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link inode " << in << " parents now " << in->dn_set << dendl;
  }

  return dn;
}
3028
3029void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3030{
3031 InodeRef in;
3032 in.swap(dn->inode);
3033 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3034 << " inode " << dn->inode << dendl;
3035
3036 // unlink from inode
3037 if (in) {
3038 if (in->is_dir()) {
3039 if (in->dir)
3040 dn->put(); // dir -> dn pin
3041 if (in->ll_ref)
3042 dn->put(); // ll_ref -> dn pin
3043 }
3044 dn->inode = 0;
3045 assert(in->dn_set.count(dn));
3046 in->dn_set.erase(dn);
3047 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3048 }
3049
3050 if (keepdentry) {
3051 dn->lease_mds = -1;
3052 } else {
3053 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3054
3055 // unlink from dir
3056 dn->dir->dentries.erase(dn->name);
3057 if (dn->dir->is_empty() && !keepdir)
3058 close_dir(dn->dir);
3059 dn->dir = 0;
3060
3061 // delete den
3062 lru.lru_remove(dn);
3063 dn->put();
3064 }
3065}
3066
3067/**
3068 * For asynchronous flushes, check for errors from the IO and
3069 * update the inode if necessary
3070 */
3071class C_Client_FlushComplete : public Context {
3072private:
3073 Client *client;
3074 InodeRef inode;
3075public:
3076 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3077 void finish(int r) override {
3078 assert(client->client_lock.is_locked_by_me());
3079 if (r != 0) {
3080 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3081 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3082 << " 0x" << std::hex << inode->ino << std::dec
3083 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3084 inode->set_async_err(r);
3085 }
3086 }
3087};
3088
3089
3090/****
3091 * caps
3092 */
3093
// Take a reference on the given cap bits for an inode.  The first
// FILE_BUFFER or FILE_CACHE ref additionally pins the inode itself
// (matched by the put_inode calls in put_cap_ref).
void Client::get_cap_ref(Inode *in, int cap)
{
  if ((cap & CEPH_CAP_FILE_BUFFER) &&
      in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
    ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
    in->get();
  }
  if ((cap & CEPH_CAP_FILE_CACHE) &&
      in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
    ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
    in->get();
  }
  in->get_cap_ref(cap);
}
3108
// Release a reference on the given cap bits.  When the last ref on a
// bit drops: finish any pending cap snapshot (on last FILE_WR), wake
// commit waiters (last FILE_BUFFER), possibly send a cap update to the
// MDS, and drop the inode pins taken in get_cap_ref.
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);  // bits whose refcount just hit zero
  if (last) {
    int put_nref = 0;
    // bits we no longer hold refs on *and* are no longer issued
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	// the snapshot was waiting for writers to finish; complete it
	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;  // matches the in->get() in get_cap_ref
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;  // matches the in->get() in get_cap_ref
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3142
// Block until we hold caps satisfying 'need' (plus as much of 'want' as is
// available and not being revoked).  On success *phave is set and a cap ref
// is taken; caller must put_cap_ref().  Returns -EBADF if the file handles
// no longer want the needed caps, -EROFS for writes on a readonly session,
// or an error from cap renewal.  Writes past max_size wait for the MDS to
// extend it (endoff > 0 triggers a wanted_max_size bump).
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // ask the MDS for a larger max_size when the write end approaches it
      if (endoff > 0 &&
	  (endoff >= (loff_t)in->max_size ||
	   endoff > (loff_t)(in->size << 1)) &&
	  endoff > (loff_t)in->wanted_max_size) {
	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	in->wanted_max_size = endoff;
	check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
	if (in->cap_snaps.rbegin()->second.writing) {
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	// any capsnap with dirty buffered data must be flushed first
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	// don't hand out caps the MDS is in the middle of revoking
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		       << " need " << ccap_string(need) << " want " << ccap_string(want)
		       << " revoking " << ccap_string(revoking)
		       << dendl;
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // caps were dropped (e.g. session reset); re-request them from the MDS
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if ((mds_wanted & file_wanted) ==
	  (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
	in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3237
3238int Client::get_caps_used(Inode *in)
3239{
3240 unsigned used = in->caps_used();
3241 if (!(used & CEPH_CAP_FILE_CACHE) &&
3242 !objectcacher->set_is_empty(&in->oset))
3243 used |= CEPH_CAP_FILE_CACHE;
3244 return used;
3245}
3246
3247void Client::cap_delay_requeue(Inode *in)
3248{
3249 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3250 in->hold_caps_until = ceph_clock_now();
3251 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3252 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3253}
3254
// Send one cap update/flush message to the MDS owning 'cap'.  'retain' is
// trimmed by whatever the MDS is revoking; cap->issued bits outside 'retain'
// are released.  'flush'/'flush_tid' tie a dirty-metadata flush to its ack.
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      bool sync, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;   // never retain what is being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
	   << " mds." << session->mds_num << " seq " << cap->seq
	   << (sync ? " sync " : " async ")
	   << " used " << ccap_string(used)
	   << " want " << ccap_string(want)
	   << " flush " << ccap_string(flush)
	   << " retain " << ccap_string(retain)
	   << " held "<< ccap_string(held)
	   << " revoking " << ccap_string(revoking)
	   << " dropping " << ccap_string(dropping)
	   << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    // Test-only failure injection (config option), not a production path.
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
				   in->ino,
				   0,
				   cap->cap_id, cap->seq,
				   cap->implemented,
				   want,
				   flush,
				   cap->mseq,
				   cap_epoch_barrier);
  // attribute the dirty metadata to whoever dirtied it
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth cap carries the max_size request
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3364
31f18b77
FG
3365static bool is_max_size_approaching(Inode *in)
3366{
3367 /* mds will adjust max size according to the reported size */
3368 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3369 return false;
3370 if (in->size >= in->max_size)
3371 return true;
3372 /* half of previous max_size increment has been used */
3373 if (in->max_size > in->reported_size &&
3374 (in->size << 1) >= in->max_size + in->reported_size)
3375 return true;
3376 return false;
3377}
7c673cae
FG
3378
3379/**
3380 * check_caps
3381 *
3382 * Examine currently used and wanted versus held caps. Release, flush or ack
3383 * revoked caps to the MDS as appropriate.
3384 *
3385 * @param in the inode to check
3386 * @param flags flags to apply to cap check
3387 */
3388void Client::check_caps(Inode *in, unsigned flags)
3389{
3390 unsigned wanted = in->caps_wanted();
3391 unsigned used = get_caps_used(in);
3392 unsigned cap_used;
3393
3394 if (in->is_dir() && (in->flags & I_COMPLETE)) {
3395 // we do this here because we don't want to drop to Fs (and then
3396 // drop the Fs if we do a create!) if that alone makes us send lookups
3397 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3398 wanted |= CEPH_CAP_FILE_EXCL;
3399 }
3400
3401 int implemented;
3402 int issued = in->caps_issued(&implemented);
3403 int revoking = implemented & ~issued;
3404
3405 int retain = wanted | used | CEPH_CAP_PIN;
3406 if (!unmounting) {
3407 if (wanted)
3408 retain |= CEPH_CAP_ANY;
3409 else
3410 retain |= CEPH_CAP_ANY_SHARED;
3411 }
3412
3413 ldout(cct, 10) << "check_caps on " << *in
3414 << " wanted " << ccap_string(wanted)
3415 << " used " << ccap_string(used)
3416 << " issued " << ccap_string(issued)
3417 << " revoking " << ccap_string(revoking)
3418 << " flags=" << flags
3419 << dendl;
3420
3421 if (in->snapid != CEPH_NOSNAP)
3422 return; //snap caps last forever, can't write
3423
3424 if (in->caps.empty())
3425 return; // guard if at end of func
3426
3427 if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
94b18763
FG
3428 (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) {
3429 if (_release(in))
3430 used &= ~CEPH_CAP_FILE_CACHE;
3431 }
7c673cae
FG
3432
3433 if (!in->cap_snaps.empty())
3434 flush_snaps(in);
3435
3436 if (flags & CHECK_CAPS_NODELAY)
3437 in->hold_caps_until = utime_t();
3438 else
3439 cap_delay_requeue(in);
3440
3441 utime_t now = ceph_clock_now();
3442
3443 map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
3444 while (it != in->caps.end()) {
3445 mds_rank_t mds = it->first;
3446 Cap *cap = it->second;
3447 ++it;
3448
3449 MetaSession *session = mds_sessions[mds];
3450 assert(session);
3451
3452 cap_used = used;
3453 if (in->auth_cap && cap != in->auth_cap)
3454 cap_used &= ~in->auth_cap->issued;
3455
3456 revoking = cap->implemented & ~cap->issued;
3457
3458 ldout(cct, 10) << " cap mds." << mds
3459 << " issued " << ccap_string(cap->issued)
3460 << " implemented " << ccap_string(cap->implemented)
3461 << " revoking " << ccap_string(revoking) << dendl;
3462
3463 if (in->wanted_max_size > in->max_size &&
3464 in->wanted_max_size > in->requested_max_size &&
3465 cap == in->auth_cap)
3466 goto ack;
3467
3468 /* approaching file_max? */
3469 if ((cap->issued & CEPH_CAP_FILE_WR) &&
31f18b77
FG
3470 cap == in->auth_cap &&
3471 is_max_size_approaching(in)) {
7c673cae 3472 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
31f18b77 3473 << ", reported " << in->reported_size << dendl;
7c673cae
FG
3474 goto ack;
3475 }
3476
3477 /* completed revocation? */
3478 if (revoking && (revoking & cap_used) == 0) {
3479 ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
3480 goto ack;
3481 }
3482
3483 /* want more caps from mds? */
3484 if (wanted & ~(cap->wanted | cap->issued))
3485 goto ack;
3486
3487 if (!revoking && unmounting && (cap_used == 0))
3488 goto ack;
3489
3490 if (wanted == cap->wanted && // mds knows what we want.
3491 ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
3492 !in->dirty_caps) // and we have no dirty caps
3493 continue;
3494
3495 if (now < in->hold_caps_until) {
3496 ldout(cct, 10) << "delaying cap release" << dendl;
3497 continue;
3498 }
3499
3500 ack:
3501 // re-send old cap/snapcap flushes first.
3502 if (session->mds_state >= MDSMap::STATE_RECONNECT &&
3503 session->mds_state < MDSMap::STATE_ACTIVE &&
3504 session->early_flushing_caps.count(in) == 0) {
3505 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3506 << " to mds." << session->mds_num << dendl;
3507 session->early_flushing_caps.insert(in);
3508 if (in->cap_snaps.size())
3509 flush_snaps(in, true);
3510 if (in->flushing_caps)
3511 flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
3512 }
3513
3514 int flushing;
3515 ceph_tid_t flush_tid;
3516 if (in->auth_cap == cap && in->dirty_caps) {
3517 flushing = mark_caps_flushing(in, &flush_tid);
3518 } else {
3519 flushing = 0;
3520 flush_tid = 0;
3521 }
3522
3523 send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
3524 retain, flushing, flush_tid);
3525 }
3526}
3527
3528
// Capture the inode's current metadata (and note any dirty buffered data)
// into a CapSnap for the old snap context, so it can be flushed to the MDS
// under that snapshot before new writes land under the new context.
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
	     (used & CEPH_CAP_FILE_WR) ||
	     (dirty & CEPH_CAP_ANY_WR)) {
    // NOTE: the (dirty & CEPH_CAP_ANY_WR) test is subsumed by the
    // in->caps_dirty() test above ('dirty' is the same value) — kept as-is.
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // a writer is active; finalize later in put_cap_ref()
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3568
// Finalize a CapSnap's size/time metadata once writing has stopped; flush it
// immediately unless buffered data (FILE_BUFFER) still has to reach the OSDs.
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // dirty buffers outstanding: _flushed_cap_snap() will flush later
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3594
3595void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3596{
3597 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3598 in->cap_snaps.at(seq).dirty_data = 0;
3599 flush_snaps(in);
3600}
3601
3602void Client::flush_snaps(Inode *in, bool all_again)
3603{
3604 ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
3605 assert(in->cap_snaps.size());
3606
3607 // pick auth mds
3608 assert(in->auth_cap);
3609 MetaSession *session = in->auth_cap->session;
3610 int mseq = in->auth_cap->mseq;
3611
3612 for (auto &p : in->cap_snaps) {
3613 CapSnap &capsnap = p.second;
3614 if (!all_again) {
3615 // only flush once per session
3616 if (capsnap.flush_tid > 0)
3617 continue;
3618 }
3619
3620 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3621 << " follows " << p.first
3622 << " size " << capsnap.size
3623 << " mtime " << capsnap.mtime
3624 << " dirty_data=" << capsnap.dirty_data
3625 << " writing=" << capsnap.writing
3626 << " on " << *in << dendl;
3627 if (capsnap.dirty_data || capsnap.writing)
3628 continue;
3629
3630 if (capsnap.flush_tid == 0) {
3631 capsnap.flush_tid = ++last_flush_tid;
3632 if (!in->flushing_cap_item.is_on_list())
3633 session->flushing_caps.push_back(&in->flushing_cap_item);
3634 session->flushing_caps_tids.insert(capsnap.flush_tid);
3635 }
3636
3637 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
3638 cap_epoch_barrier);
3639 if (user_id >= 0)
3640 m->caller_uid = user_id;
3641 if (group_id >= 0)
3642 m->caller_gid = group_id;
3643
3644 m->set_client_tid(capsnap.flush_tid);
3645 m->head.snap_follows = p.first;
3646
3647 m->head.caps = capsnap.issued;
3648 m->head.dirty = capsnap.dirty;
3649
3650 m->head.uid = capsnap.uid;
3651 m->head.gid = capsnap.gid;
3652 m->head.mode = capsnap.mode;
3653 m->btime = capsnap.btime;
3654
3655 m->size = capsnap.size;
3656
3657 m->head.xattr_version = capsnap.xattr_version;
3658 ::encode(capsnap.xattrs, m->xattrbl);
3659
3660 m->ctime = capsnap.ctime;
3661 m->btime = capsnap.btime;
3662 m->mtime = capsnap.mtime;
3663 m->atime = capsnap.atime;
3664 m->time_warp_seq = capsnap.time_warp_seq;
3665 m->change_attr = capsnap.change_attr;
3666
3667 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3668 m->inline_version = in->inline_version;
3669 m->inline_data = in->inline_data;
3670 }
3671
3672 assert(!session->flushing_caps_tids.empty());
3673 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3674
3675 session->con->send_message(m);
3676 }
3677}
3678
3679
3680
3681void Client::wait_on_list(list<Cond*>& ls)
3682{
3683 Cond cond;
3684 ls.push_back(&cond);
3685 cond.Wait(client_lock);
3686 ls.remove(&cond);
3687}
3688
3689void Client::signal_cond_list(list<Cond*>& ls)
3690{
3691 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3692 (*it)->Signal();
3693}
3694
3695void Client::wait_on_context_list(list<Context*>& ls)
3696{
3697 Cond cond;
3698 bool done = false;
3699 int r;
3700 ls.push_back(new C_Cond(&cond, &done, &r));
3701 while (!done)
3702 cond.Wait(client_lock);
3703}
3704
3705void Client::signal_context_list(list<Context*>& ls)
3706{
3707 while (!ls.empty()) {
3708 ls.front()->complete(0);
3709 ls.pop_front();
3710 }
3711}
3712
3713void Client::wake_inode_waiters(MetaSession *s)
3714{
3715 xlist<Cap*>::iterator iter = s->caps.begin();
3716 while (!iter.end()){
3717 signal_cond_list((*iter)->inode->waitfor_caps);
3718 ++iter;
3719 }
3720}
3721
3722
3723// flush dirty data (from objectcache)
3724
3725class C_Client_CacheInvalidate : public Context {
3726private:
3727 Client *client;
3728 vinodeno_t ino;
3729 int64_t offset, length;
3730public:
3731 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3732 client(c), offset(off), length(len) {
3733 if (client->use_faked_inos())
3734 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3735 else
3736 ino = in->vino();
3737 }
3738 void finish(int r) override {
3739 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3740 assert(!client->client_lock.is_locked_by_me());
3741 client->_async_invalidate(ino, offset, length);
3742 }
3743};
3744
3745void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3746{
3747 if (unmounting)
3748 return;
3749 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3750 ino_invalidate_cb(callback_handle, ino, off, len);
3751}
3752
3753void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3754
3755 if (ino_invalidate_cb)
3756 // we queue the invalidate, which calls the callback and decrements the ref
3757 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3758}
3759
// Drop all cached data for an inode: release it from the object cacher (if
// enabled) and schedule a kernel page-cache invalidation for the whole file.
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    // release_set() can't drop dirty/in-flight buffers; complain if any remain
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  _schedule_invalidate_callback(in, 0, 0);
}
3773
// Range-limited variant: discard the cached (and writeback-pending) extents
// covering [off, off+len) and schedule the matching kernel invalidation.
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // map the file range onto object extents per the inode's striping layout
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
3787
3788bool Client::_release(Inode *in)
3789{
3790 ldout(cct, 20) << "_release " << *in << dendl;
3791 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3792 _invalidate_inode_cache(in);
3793 return true;
3794 }
3795 return false;
3796}
3797
3798bool Client::_flush(Inode *in, Context *onfinish)
3799{
3800 ldout(cct, 10) << "_flush " << *in << dendl;
3801
3802 if (!in->oset.dirty_or_tx) {
3803 ldout(cct, 10) << " nothing to flush" << dendl;
3804 onfinish->complete(0);
3805 return true;
3806 }
3807
3808 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 3809 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
3810 objectcacher->purge_set(&in->oset);
3811 if (onfinish) {
3812 onfinish->complete(-ENOSPC);
3813 }
3814 return true;
3815 }
3816
3817 return objectcacher->flush_set(&in->oset, onfinish);
3818}
3819
// Synchronously flush one byte range of the inode's buffered data.
// client_lock is dropped while waiting so writeback can make progress.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // private lock/cond pair just for this wait; C_SafeCond sets 'safe' and
  // signals when the flush completes
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3844
// ObjectCacher "flush complete" hook for an inode's object set.
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  //  Mutex::Locker l(client_lock);
  assert(client_lock.is_locked());   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  assert(in);
  _flushed(in);
}
3853
// All dirty buffers for this inode reached the OSDs; release the cap refs
// taken while the buffered data was outstanding.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3860
3861
3862
3863// checks common to add_update_cap, handle_cap_grant
3864void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3865{
3866 unsigned had = in->caps_issued();
3867
3868 if ((issued & CEPH_CAP_FILE_CACHE) &&
3869 !(had & CEPH_CAP_FILE_CACHE))
3870 in->cache_gen++;
3871
3872 if ((issued & CEPH_CAP_FILE_SHARED) &&
3873 !(had & CEPH_CAP_FILE_SHARED)) {
3874 in->shared_gen++;
3875
3876 if (in->is_dir())
3877 clear_dir_complete_and_ordered(in, true);
3878 }
3879}
3880
// Install or refresh the cap this MDS granted for 'in'.  Handles the
// export/import race, opens the snaprealm on the first cap, may switch the
// auth cap, and wakes waiters when new bits arrive.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
			    int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // first cap from this MDS for this inode
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // very first cap overall: attach the inode to its snap realm
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as auth if it is newer (by mseq) than the current one
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << "add_update_cap changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued |= issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->gen = mds_session->cap_gen;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
	continue;
      if (it->second->implemented & ~it->second->issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
3966
// Tear down one cap: optionally queue a release message to the MDS, detach
// it from the inode and session, and close the snaprealm when the last cap
// for the inode goes away.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    // releases are batched on the session rather than sent immediately
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;   // defensive: local pointer dangles after delete

  if (!in->is_any_caps()) {
    // last cap gone: drop the inode's snaprealm membership
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
4005
4006void Client::remove_all_caps(Inode *in)
4007{
4008 while (!in->caps.empty())
4009 remove_cap(in->caps.begin()->second, true);
4010}
4011
// Forcibly drop every cap held via this session (e.g. on session close or
// MDS failure).  Dirty/flushing state is discarded with a loud error; inodes
// are flagged I_CAP_DROPPED so get_caps() knows to re-request.
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      // bitwise-or of the two cap masks collapsed into a bool:
      // "any dirty or flushing caps"
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;
    }
    remove_cap(cap, false);
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // hold a temporary ref so dropping the capsnaps can't free the inode
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // drop the ref that was pinned by the dirty/flushing state
      put_inode(in);
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4047
b32b8144
FG
// Invoke the registered remount callback (used to make the kernel trim its
// dentries).  Logs failures and, depending on config, aborts the client when
// the remount fails while still mounted.  Returns the callback's result.
int Client::_do_remount(void)
{
  errno = 0;
  int r = remount_cb(callback_handle);
  if (r != 0) {
    int e = errno;
    // NOTE(review): 'whoami' appears unused directly — presumably referenced
    // by the lderr/dout prefix macro; confirm before removing.
    client_t whoami = get_nodeid();
    if (r == -1) {
      // r == -1 convention: error detail is in errno
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "errno = " << e << " (" << strerror(e) << ")" << dendl;
    } else {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "return code = " << r << dendl;
    }
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_remount") ||
        cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort && !unmounting) {
      lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
      ceph_abort();
    }
  }
  return r;
}
4073
7c673cae
FG
// Finisher context that performs the remount on the remount_finisher thread
// (outside client_lock).
class C_Client_Remount : public Context  {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    assert(r == 0);   // queued with result 0; anything else is a logic error
    client->_do_remount();
  }
};
4084
4085void Client::_invalidate_kernel_dcache()
4086{
4087 if (unmounting)
4088 return;
94b18763
FG
4089 if (can_invalidate_dentries) {
4090 if (dentry_invalidate_cb && root->dir) {
4091 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4092 p != root->dir->dentries.end();
4093 ++p) {
4094 if (p->second->inode)
4095 _schedule_invalidate_dentry_callback(p->second, false);
4096 }
7c673cae
FG
4097 }
4098 } else if (remount_cb) {
4099 // Hacky:
4100 // when remounting a file system, linux kernel trims all unused dentries in the fs
4101 remount_finisher.queue(new C_Client_Remount(this));
4102 }
4103}
4104
// Shrink this session's cap count toward 'max' (requested by the MDS):
// drop disposable non-auth caps outright, and queue expireable dentries for
// trimming so their inodes (and caps) can be released.  Falls back to a
// kernel dcache invalidation if still over the limit.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    // InodeRef keeps the inode alive while we poke at its caps/dentries
    InodeRef in(cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	cap = (remove_cap(cap, true), nullptr);
	trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      while (q != in->dn_set.end()) {
	Dentry *dn = *q++;
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
	  ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
	  to_trim.insert(dn);
	} else {
	  ldout(cct, 20) << " not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      if (all && in->ino != MDS_INO_ROOT) {
	ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }
  // now that traversal is done it's safe to delete the queued dentries
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > max)
    _invalidate_kernel_dcache();
}
4170
4171void Client::force_session_readonly(MetaSession *s)
4172{
4173 s->readonly = true;
4174 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4175 Inode *in = (*p)->inode;
4176 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4177 signal_cond_list(in->waitfor_caps);
4178 }
4179}
4180
7c673cae
FG
4181int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4182{
4183 MetaSession *session = in->auth_cap->session;
4184
4185 int flushing = in->dirty_caps;
4186 assert(flushing);
4187
4188 ceph_tid_t flush_tid = ++last_flush_tid;
4189 in->flushing_cap_tids[flush_tid] = flushing;
4190
4191 if (!in->flushing_caps) {
4192 ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
4193 num_flushing_caps++;
4194 } else {
4195 ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
4196 }
4197
4198 in->flushing_caps |= flushing;
28e407b8 4199 in->mark_caps_clean();
7c673cae
FG
4200
4201 if (!in->flushing_cap_item.is_on_list())
4202 session->flushing_caps.push_back(&in->flushing_cap_item);
4203 session->flushing_caps_tids.insert(flush_tid);
4204
4205 *ptid = flush_tid;
4206 return flushing;
4207}
4208
4209void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4210{
4211 for (auto &p : in->cap_snaps) {
4212 CapSnap &capsnap = p.second;
4213 if (capsnap.flush_tid > 0) {
4214 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4215 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4216 }
4217 }
4218 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4219 it != in->flushing_cap_tids.end();
4220 ++it) {
4221 old_s->flushing_caps_tids.erase(it->first);
4222 new_s->flushing_caps_tids.insert(it->first);
4223 }
4224 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4225}
4226
4227/*
4228 * Flush all caps back to the MDS. Because the callers generally wait on the
4229 * result of this function (syncfs and umount cases), we set
4230 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4231 */
4232void Client::flush_caps_sync()
4233{
4234 ldout(cct, 10) << __func__ << dendl;
28e407b8 4235 xlist<Inode*>::iterator p = delayed_list.begin();
7c673cae
FG
4236 while (!p.end()) {
4237 unsigned flags = CHECK_CAPS_NODELAY;
4238 Inode *in = *p;
4239
4240 ++p;
28e407b8
AA
4241 delayed_list.pop_front();
4242 if (p.end() && dirty_list.empty())
7c673cae
FG
4243 flags |= CHECK_CAPS_SYNCHRONOUS;
4244 check_caps(in, flags);
4245 }
4246
4247 // other caps, too
28e407b8 4248 p = dirty_list.begin();
7c673cae
FG
4249 while (!p.end()) {
4250 unsigned flags = CHECK_CAPS_NODELAY;
4251 Inode *in = *p;
4252
4253 ++p;
4254 if (p.end())
4255 flags |= CHECK_CAPS_SYNCHRONOUS;
4256 check_caps(in, flags);
4257 }
4258}
4259
4260void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
4261{
4262 ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
4263 Cap *cap = in->auth_cap;
4264 assert(cap->session == session);
4265
4266 for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
4267 p != in->flushing_cap_tids.end();
4268 ++p) {
4269 bool req_sync = false;
4270
4271 /* If this is a synchronous request, then flush the journal on last one */
4272 if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
4273 req_sync = true;
4274
4275 send_cap(in, session, cap, req_sync,
4276 (get_caps_used(in) | in->caps_dirty()),
4277 in->caps_wanted(), (cap->issued | cap->implemented),
4278 p->second, p->first);
4279 }
4280}
4281
4282void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4283{
4284 while (in->flushing_caps) {
4285 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4286 assert(it != in->flushing_cap_tids.end());
4287 if (it->first > want)
4288 break;
4289 ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
4290 << ccap_string(it->second) << " want " << want
4291 << " last " << it->first << dendl;
4292 wait_on_list(in->waitfor_caps);
4293 }
4294}
4295
4296void Client::wait_sync_caps(ceph_tid_t want)
4297{
4298 retry:
4299 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4300 << num_flushing_caps << " total flushing)" << dendl;
4301 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4302 p != mds_sessions.end();
4303 ++p) {
4304 MetaSession *s = p->second;
4305 if (s->flushing_caps_tids.empty())
4306 continue;
4307 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4308 if (oldest_tid <= want) {
4309 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4310 << " (want " << want << ")" << dendl;
4311 sync_cond.Wait(client_lock);
4312 goto retry;
4313 }
4314 }
4315}
4316
4317void Client::kick_flushing_caps(MetaSession *session)
4318{
4319 mds_rank_t mds = session->mds_num;
4320 ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;
4321
4322 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4323 Inode *in = *p;
4324 if (session->early_flushing_caps.count(in))
4325 continue;
4326 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4327 if (in->cap_snaps.size())
4328 flush_snaps(in, true);
4329 if (in->flushing_caps)
4330 flush_caps(in, session);
4331 }
4332
4333 session->early_flushing_caps.clear();
4334}
4335
4336void Client::early_kick_flushing_caps(MetaSession *session)
4337{
4338 session->early_flushing_caps.clear();
4339
4340 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4341 Inode *in = *p;
4342 assert(in->auth_cap);
4343
4344 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4345 // stage. This guarantees that MDS processes the cap flush message before issuing
4346 // the flushing caps to other client.
4347 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
4348 continue;
4349
4350 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4351 << " to mds." << session->mds_num << dendl;
4352
4353 session->early_flushing_caps.insert(in);
4354
4355 if (in->cap_snaps.size())
4356 flush_snaps(in, true);
4357 if (in->flushing_caps)
4358 flush_caps(in, session);
4359
4360 }
4361}
4362
4363void Client::kick_maxsize_requests(MetaSession *session)
4364{
4365 xlist<Cap*>::iterator iter = session->caps.begin();
4366 while (!iter.end()){
4367 (*iter)->inode->requested_max_size = 0;
4368 (*iter)->inode->wanted_max_size = 0;
4369 signal_cond_list((*iter)->inode->waitfor_caps);
4370 ++iter;
4371 }
4372}
4373
4374void SnapRealm::build_snap_context()
4375{
4376 set<snapid_t> snaps;
4377 snapid_t max_seq = seq;
4378
4379 // start with prior_parents?
4380 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4381 snaps.insert(prior_parent_snaps[i]);
4382
4383 // current parent's snaps
4384 if (pparent) {
4385 const SnapContext& psnapc = pparent->get_snap_context();
4386 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4387 if (psnapc.snaps[i] >= parent_since)
4388 snaps.insert(psnapc.snaps[i]);
4389 if (psnapc.seq > max_seq)
4390 max_seq = psnapc.seq;
4391 }
4392
4393 // my snaps
4394 for (unsigned i=0; i<my_snaps.size(); i++)
4395 snaps.insert(my_snaps[i]);
4396
4397 // ok!
4398 cached_snap_context.seq = max_seq;
4399 cached_snap_context.snaps.resize(0);
4400 cached_snap_context.snaps.reserve(snaps.size());
4401 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4402 cached_snap_context.snaps.push_back(*p);
4403}
4404
4405void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4406{
4407 list<SnapRealm*> q;
4408 q.push_back(realm);
4409
4410 while (!q.empty()) {
4411 realm = q.front();
4412 q.pop_front();
4413
4414 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4415 realm->invalidate_cache();
4416
4417 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4418 p != realm->pchildren.end();
4419 ++p)
4420 q.push_back(*p);
4421 }
4422}
4423
4424SnapRealm *Client::get_snap_realm(inodeno_t r)
4425{
4426 SnapRealm *realm = snap_realms[r];
4427 if (!realm)
4428 snap_realms[r] = realm = new SnapRealm(r);
4429 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4430 realm->nref++;
4431 return realm;
4432}
4433
4434SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4435{
4436 if (snap_realms.count(r) == 0) {
4437 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4438 return NULL;
4439 }
4440 SnapRealm *realm = snap_realms[r];
4441 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4442 realm->nref++;
4443 return realm;
4444}
4445
4446void Client::put_snap_realm(SnapRealm *realm)
4447{
4448 ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
4449 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4450 if (--realm->nref == 0) {
4451 snap_realms.erase(realm->ino);
4452 if (realm->pparent) {
4453 realm->pparent->pchildren.erase(realm);
4454 put_snap_realm(realm->pparent);
4455 }
4456 delete realm;
4457 }
4458}
4459
4460bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4461{
4462 if (realm->parent != parent) {
4463 ldout(cct, 10) << "adjust_realm_parent " << *realm
4464 << " " << realm->parent << " -> " << parent << dendl;
4465 realm->parent = parent;
4466 if (realm->pparent) {
4467 realm->pparent->pchildren.erase(realm);
4468 put_snap_realm(realm->pparent);
4469 }
4470 realm->pparent = get_snap_realm(parent);
4471 realm->pparent->pchildren.insert(realm);
4472 return true;
4473 }
4474 return false;
4475}
4476
4477static bool has_new_snaps(const SnapContext& old_snapc,
4478 const SnapContext& new_snapc)
4479{
4480 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4481}
4482
4483
4484void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
4485{
4486 SnapRealm *first_realm = NULL;
4487 ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
4488
4489 map<SnapRealm*, SnapContext> dirty_realms;
4490
4491 bufferlist::iterator p = bl.begin();
4492 while (!p.end()) {
4493 SnapRealmInfo info;
4494 ::decode(info, p);
4495 SnapRealm *realm = get_snap_realm(info.ino());
4496
4497 bool invalidate = false;
4498
4499 if (info.seq() > realm->seq) {
4500 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
4501 << dendl;
4502
4503 if (flush) {
4504 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4505 // flush me + children
4506 list<SnapRealm*> q;
4507 q.push_back(realm);
4508 while (!q.empty()) {
4509 SnapRealm *realm = q.front();
4510 q.pop_front();
4511
4512 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4513 p != realm->pchildren.end();
4514 ++p)
4515 q.push_back(*p);
4516
4517 if (dirty_realms.count(realm) == 0) {
4518 realm->nref++;
4519 dirty_realms[realm] = realm->get_snap_context();
4520 }
4521 }
4522 }
4523
4524 // update
4525 realm->seq = info.seq();
4526 realm->created = info.created();
4527 realm->parent_since = info.parent_since();
4528 realm->prior_parent_snaps = info.prior_parent_snaps;
4529 realm->my_snaps = info.my_snaps;
4530 invalidate = true;
4531 }
4532
4533 // _always_ verify parent
4534 if (adjust_realm_parent(realm, info.parent()))
4535 invalidate = true;
4536
4537 if (invalidate) {
4538 invalidate_snaprealm_and_children(realm);
4539 ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
4540 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4541 } else {
4542 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
4543 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4544 }
4545
4546 if (!first_realm)
4547 first_realm = realm;
4548 else
4549 put_snap_realm(realm);
4550 }
4551
4552 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4553 q != dirty_realms.end();
4554 ++q) {
4555 SnapRealm *realm = q->first;
4556 // if there are new snaps ?
4557 if (has_new_snaps(q->second, realm->get_snap_context())) {
4558 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4559 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4560 while (!r.end()) {
4561 Inode *in = *r;
4562 ++r;
4563 queue_cap_snap(in, q->second);
4564 }
4565 } else {
4566 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4567 }
4568 put_snap_realm(realm);
4569 }
4570
4571 if (realm_ret)
4572 *realm_ret = first_realm;
4573 else
4574 put_snap_realm(first_realm);
4575}
4576
4577void Client::handle_snap(MClientSnap *m)
4578{
4579 ldout(cct, 10) << "handle_snap " << *m << dendl;
4580 mds_rank_t mds = mds_rank_t(m->get_source().num());
4581 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4582 if (!session) {
4583 m->put();
4584 return;
4585 }
4586
4587 got_mds_push(session);
4588
4589 map<Inode*, SnapContext> to_move;
4590 SnapRealm *realm = 0;
4591
4592 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4593 assert(m->head.split);
4594 SnapRealmInfo info;
4595 bufferlist::iterator p = m->bl.begin();
4596 ::decode(info, p);
4597 assert(info.ino() == m->head.split);
4598
4599 // flush, then move, ino's.
4600 realm = get_snap_realm(info.ino());
4601 ldout(cct, 10) << " splitting off " << *realm << dendl;
4602 for (vector<inodeno_t>::iterator p = m->split_inos.begin();
4603 p != m->split_inos.end();
4604 ++p) {
4605 vinodeno_t vino(*p, CEPH_NOSNAP);
4606 if (inode_map.count(vino)) {
4607 Inode *in = inode_map[vino];
4608 if (!in->snaprealm || in->snaprealm == realm)
4609 continue;
4610 if (in->snaprealm->created > info.created()) {
4611 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4612 << *in->snaprealm << dendl;
4613 continue;
4614 }
4615 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4616
4617
4618 in->snaprealm_item.remove_myself();
4619 to_move[in] = in->snaprealm->get_snap_context();
4620 put_snap_realm(in->snaprealm);
4621 }
4622 }
4623
4624 // move child snaprealms, too
4625 for (vector<inodeno_t>::iterator p = m->split_realms.begin();
4626 p != m->split_realms.end();
4627 ++p) {
4628 ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
4629 SnapRealm *child = get_snap_realm_maybe(*p);
4630 if (!child)
4631 continue;
4632 adjust_realm_parent(child, realm->ino);
4633 put_snap_realm(child);
4634 }
4635 }
4636
4637 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
4638
4639 if (realm) {
4640 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
4641 Inode *in = p->first;
4642 in->snaprealm = realm;
4643 realm->inodes_with_caps.push_back(&in->snaprealm_item);
4644 realm->nref++;
4645 // queue for snap writeback
4646 if (has_new_snaps(p->second, realm->get_snap_context()))
4647 queue_cap_snap(in, p->second);
4648 }
4649 put_snap_realm(realm);
4650 }
4651
4652 m->put();
4653}
4654
4655void Client::handle_quota(MClientQuota *m)
4656{
4657 mds_rank_t mds = mds_rank_t(m->get_source().num());
4658 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4659 if (!session) {
4660 m->put();
4661 return;
4662 }
4663
4664 got_mds_push(session);
4665
4666 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4667
4668 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4669 if (inode_map.count(vino)) {
4670 Inode *in = NULL;
4671 in = inode_map[vino];
4672
4673 if (in) {
4674 in->quota = m->quota;
4675 in->rstat = m->rstat;
4676 }
4677 }
4678
4679 m->put();
4680}
4681
4682void Client::handle_caps(MClientCaps *m)
4683{
4684 mds_rank_t mds = mds_rank_t(m->get_source().num());
4685 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4686 if (!session) {
4687 m->put();
4688 return;
4689 }
4690
4691 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4692 // Pause RADOS operations until we see the required epoch
4693 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4694 }
4695
4696 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4697 // Record the barrier so that we will transmit it to MDS when releasing
4698 set_cap_epoch_barrier(m->osd_epoch_barrier);
4699 }
4700
4701 got_mds_push(session);
4702
4703 m->clear_payload(); // for if/when we send back to MDS
4704
4705 Inode *in = 0;
4706 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4707 if (inode_map.count(vino))
4708 in = inode_map[vino];
4709 if (!in) {
4710 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4711 ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4712 session->enqueue_cap_release(
4713 m->get_ino(),
4714 m->get_cap_id(),
4715 m->get_seq(),
4716 m->get_mseq(),
4717 cap_epoch_barrier);
4718 } else {
4719 ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
4720 }
4721 m->put();
4722
4723 // in case the mds is waiting on e.g. a revocation
4724 flush_cap_releases();
4725 return;
4726 }
4727
4728 switch (m->get_op()) {
4729 case CEPH_CAP_OP_EXPORT:
4730 return handle_cap_export(session, in, m);
4731 case CEPH_CAP_OP_FLUSHSNAP_ACK:
4732 return handle_cap_flushsnap_ack(session, in, m);
4733 case CEPH_CAP_OP_IMPORT:
4734 handle_cap_import(session, in, m);
4735 }
4736
4737 if (in->caps.count(mds) == 0) {
4738 ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
4739 m->put();
4740 return;
4741 }
4742
4743 Cap *cap = in->caps[mds];
4744
4745 switch (m->get_op()) {
4746 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4747 case CEPH_CAP_OP_IMPORT:
4748 case CEPH_CAP_OP_REVOKE:
4749 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
4750 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
4751 default:
4752 m->put();
4753 }
4754}
4755
4756void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
4757{
4758 mds_rank_t mds = session->mds_num;
4759
4760 ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
4761 << " IMPORT from mds." << mds << dendl;
4762
4763 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4764 Cap *cap = NULL;
4765 UserPerm cap_perms;
4766 if (m->peer.cap_id && in->caps.count(peer_mds)) {
4767 cap = in->caps[peer_mds];
4768 if (cap) {
4769 cap_perms = cap->latest_perms;
4770 }
4771 }
4772
4773 // add/update it
4774 SnapRealm *realm = NULL;
4775 update_snap_trace(m->snapbl, &realm);
4776
4777 add_update_cap(in, session, m->get_cap_id(),
4778 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
4779 CEPH_CAP_FLAG_AUTH, cap_perms);
4780
4781 if (cap && cap->cap_id == m->peer.cap_id) {
4782 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
4783 }
4784
4785 if (realm)
4786 put_snap_realm(realm);
4787
4788 if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
4789 // reflush any/all caps (if we are now the auth_cap)
4790 if (in->cap_snaps.size())
4791 flush_snaps(in, true);
4792 if (in->flushing_caps)
4793 flush_caps(in, session);
4794 }
4795}
4796
4797void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
4798{
4799 mds_rank_t mds = session->mds_num;
4800
4801 ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
4802 << " EXPORT from mds." << mds << dendl;
4803
4804 Cap *cap = NULL;
4805 if (in->caps.count(mds))
4806 cap = in->caps[mds];
4807
4808 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4809
4810 if (cap && cap->cap_id == m->get_cap_id()) {
4811 if (m->peer.cap_id) {
4812 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
4813 if (in->caps.count(peer_mds)) {
4814 Cap *tcap = in->caps[peer_mds];
181888fb 4815 if (tcap->cap_id == m->peer.cap_id &&
7c673cae
FG
4816 ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
4817 tcap->cap_id = m->peer.cap_id;
4818 tcap->seq = m->peer.seq - 1;
4819 tcap->issue_seq = tcap->seq;
4820 tcap->mseq = m->peer.mseq;
4821 tcap->issued |= cap->issued;
4822 tcap->implemented |= cap->issued;
4823 if (cap == in->auth_cap)
4824 in->auth_cap = tcap;
4825 if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
4826 adjust_session_flushing_caps(in, session, tsession);
4827 }
4828 } else {
4829 add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
4830 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
4831 cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
4832 cap->latest_perms);
4833 }
4834 } else {
4835 if (cap == in->auth_cap)
4836 in->flags |= I_CAP_DROPPED;
4837 }
4838
4839 remove_cap(cap, false);
4840 }
4841
4842 m->put();
4843}
4844
4845void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
4846{
4847 mds_rank_t mds = session->mds_num;
4848 assert(in->caps[mds]);
4849
4850 ldout(cct, 10) << "handle_cap_trunc on ino " << *in
4851 << " size " << in->size << " -> " << m->get_size()
4852 << dendl;
4853
1adf2230
AA
4854 int issued;
4855 in->caps_issued(&issued);
4856 issued |= in->caps_dirty();
4857 update_inode_file_size(in, issued, m->get_size(),
4858 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
4859 m->put();
4860}
4861
4862void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
4863{
4864 ceph_tid_t flush_ack_tid = m->get_client_tid();
4865 int dirty = m->get_dirty();
4866 int cleaned = 0;
4867 int flushed = 0;
4868
4869 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4870 it != in->flushing_cap_tids.end(); ) {
4871 if (it->first == flush_ack_tid)
4872 cleaned = it->second;
4873 if (it->first <= flush_ack_tid) {
4874 session->flushing_caps_tids.erase(it->first);
4875 in->flushing_cap_tids.erase(it++);
4876 ++flushed;
4877 continue;
4878 }
4879 cleaned &= ~it->second;
4880 if (!cleaned)
4881 break;
4882 ++it;
4883 }
4884
4885 ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
4886 << " cleaned " << ccap_string(cleaned) << " on " << *in
4887 << " with " << ccap_string(dirty) << dendl;
4888
4889 if (flushed) {
4890 signal_cond_list(in->waitfor_caps);
4891 if (session->flushing_caps_tids.empty() ||
4892 *session->flushing_caps_tids.begin() > flush_ack_tid)
4893 sync_cond.Signal();
4894 }
4895
4896 if (!dirty) {
4897 in->cap_dirtier_uid = -1;
4898 in->cap_dirtier_gid = -1;
4899 }
4900
4901 if (!cleaned) {
4902 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
4903 } else {
4904 if (in->flushing_caps) {
4905 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
4906 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
4907 in->flushing_caps &= ~cleaned;
4908 if (in->flushing_caps == 0) {
4909 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
4910 num_flushing_caps--;
4911 if (in->cap_snaps.empty())
4912 in->flushing_cap_item.remove_myself();
4913 }
4914 if (!in->caps_dirty())
4915 put_inode(in);
4916 }
4917 }
4918
4919 m->put();
4920}
4921
4922
4923void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
4924{
4925 mds_rank_t mds = session->mds_num;
4926 assert(in->caps[mds]);
4927 snapid_t follows = m->get_snap_follows();
4928
4929 if (in->cap_snaps.count(follows)) {
4930 CapSnap &capsnap = in->cap_snaps.at(follows);
4931 if (m->get_client_tid() != capsnap.flush_tid) {
4932 ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
4933 } else {
4934 ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
4935 << " on " << *in << dendl;
4936 InodeRef tmp_ref;
4937 if (in->get_num_ref() == 1)
4938 tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
4939 if (in->flushing_caps == 0 && in->cap_snaps.empty())
4940 in->flushing_cap_item.remove_myself();
4941 session->flushing_caps_tids.erase(capsnap.flush_tid);
4942 in->cap_snaps.erase(follows);
4943 }
4944 } else {
4945 ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
4946 << " on " << *in << dendl;
4947 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
4948 }
4949
4950 m->put();
4951}
4952
4953class C_Client_DentryInvalidate : public Context {
4954private:
4955 Client *client;
4956 vinodeno_t dirino;
4957 vinodeno_t ino;
4958 string name;
4959public:
4960 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
4961 client(c), name(dn->name) {
4962 if (client->use_faked_inos()) {
4963 dirino.ino = dn->dir->parent_inode->faked_ino;
4964 if (del)
4965 ino.ino = dn->inode->faked_ino;
4966 } else {
4967 dirino = dn->dir->parent_inode->vino();
4968 if (del)
4969 ino = dn->inode->vino();
4970 }
4971 if (!del)
4972 ino.ino = inodeno_t();
4973 }
4974 void finish(int r) override {
4975 // _async_dentry_invalidate is responsible for its own locking
4976 assert(!client->client_lock.is_locked_by_me());
4977 client->_async_dentry_invalidate(dirino, ino, name);
4978 }
4979};
4980
4981void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
4982{
4983 if (unmounting)
4984 return;
4985 ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
4986 << " in dir " << dirino << dendl;
4987 dentry_invalidate_cb(callback_handle, dirino, ino, name);
4988}
4989
4990void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
4991{
4992 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
4993 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
4994}
4995
4996void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
4997{
4998 int ref = in->get_num_ref();
4999
5000 if (in->dir && !in->dir->dentries.empty()) {
5001 for (auto p = in->dir->dentries.begin();
5002 p != in->dir->dentries.end(); ) {
5003 Dentry *dn = p->second;
5004 ++p;
5005 /* rmsnap removes whole subtree, need trim inodes recursively.
5006 * we don't need to invalidate dentries recursively. because
5007 * invalidating a directory dentry effectively invalidate
5008 * whole subtree */
5009 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5010 _try_to_trim_inode(dn->inode.get(), false);
5011
5012 if (dn->lru_is_expireable())
5013 unlink(dn, true, false); // keep dir, drop dentry
5014 }
5015 if (in->dir->dentries.empty()) {
5016 close_dir(in->dir);
5017 --ref;
5018 }
5019 }
5020
5021 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
5022 InodeRef snapdir = open_snapdir(in);
5023 _try_to_trim_inode(snapdir.get(), false);
5024 --ref;
5025 }
5026
5027 if (ref > 0 && in->ll_ref > 0 && sched_inval) {
5028 set<Dentry*>::iterator q = in->dn_set.begin();
5029 while (q != in->dn_set.end()) {
5030 Dentry *dn = *q++;
5031 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5032 // so in->dn_set doesn't always reflect the state of kernel's dcache.
5033 _schedule_invalidate_dentry_callback(dn, true);
5034 unlink(dn, true, true);
5035 }
5036 }
5037}
5038
5039void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
5040{
5041 mds_rank_t mds = session->mds_num;
5042 int used = get_caps_used(in);
5043 int wanted = in->caps_wanted();
5044
5045 const int old_caps = cap->issued;
5046 const int new_caps = m->get_caps();
5047 ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
5048 << " mds." << mds << " seq " << m->get_seq()
5049 << " caps now " << ccap_string(new_caps)
5050 << " was " << ccap_string(old_caps) << dendl;
5051 cap->seq = m->get_seq();
28e407b8 5052 cap->gen = session->cap_gen;
7c673cae 5053
7c673cae 5054 // update inode
1adf2230
AA
5055 int issued;
5056 in->caps_issued(&issued);
5057 issued |= in->caps_dirty();
7c673cae 5058
1adf2230
AA
5059 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5060 !(issued & CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
5061 in->mode = m->head.mode;
5062 in->uid = m->head.uid;
5063 in->gid = m->head.gid;
5064 in->btime = m->btime;
5065 }
5066 bool deleted_inode = false;
1adf2230
AA
5067 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5068 !(issued & CEPH_CAP_LINK_EXCL)) {
7c673cae
FG
5069 in->nlink = m->head.nlink;
5070 if (in->nlink == 0 &&
5071 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5072 deleted_inode = true;
5073 }
1adf2230 5074 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
7c673cae
FG
5075 m->xattrbl.length() &&
5076 m->head.xattr_version > in->xattr_version) {
5077 bufferlist::iterator p = m->xattrbl.begin();
5078 ::decode(in->xattrs, p);
5079 in->xattr_version = m->head.xattr_version;
5080 }
28e407b8
AA
5081
5082 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5083 in->dirstat.nfiles = m->get_nfiles();
5084 in->dirstat.nsubdirs = m->get_nsubdirs();
5085 }
5086
1adf2230
AA
5087 if (new_caps & CEPH_CAP_ANY_RD) {
5088 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5089 m->get_ctime(), m->get_mtime(), m->get_atime());
5090 }
5091
5092 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5093 in->layout = m->get_layout();
5094 update_inode_file_size(in, issued, m->get_size(),
5095 m->get_truncate_seq(), m->get_truncate_size());
5096 }
5097
5098 if (m->inline_version > in->inline_version) {
5099 in->inline_data = m->inline_data;
5100 in->inline_version = m->inline_version;
5101 }
5102
5103 /* always take a newer change attr */
5104 if (m->get_change_attr() > in->change_attr)
5105 in->change_attr = m->get_change_attr();
7c673cae
FG
5106
5107 // max_size
5108 if (cap == in->auth_cap &&
1adf2230
AA
5109 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5110 (m->get_max_size() != in->max_size)) {
7c673cae
FG
5111 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5112 in->max_size = m->get_max_size();
5113 if (in->max_size > in->wanted_max_size) {
5114 in->wanted_max_size = 0;
5115 in->requested_max_size = 0;
5116 }
5117 }
5118
5119 bool check = false;
5120 if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
5121 check = true;
5122
5123 check_cap_issue(in, cap, new_caps);
5124
5125 // update caps
b32b8144
FG
5126 int revoked = old_caps & ~new_caps;
5127 if (revoked) {
5128 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5129 cap->issued = new_caps;
5130 cap->implemented |= new_caps;
5131
b32b8144
FG
5132 // recall delegations if we're losing caps necessary for them
5133 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5134 in->recall_deleg(false);
5135 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5136 in->recall_deleg(true);
5137
28e407b8
AA
5138 if ((used & revoked & CEPH_CAP_FILE_BUFFER) &&
5139 !_flush(in, new C_Client_FlushComplete(this, in))) {
7c673cae 5140 // waitin' for flush
28e407b8 5141 } else if (revoked & CEPH_CAP_FILE_CACHE) {
7c673cae
FG
5142 if (_release(in))
5143 check = true;
5144 } else {
5145 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5146 check = true;
5147 }
7c673cae
FG
5148 } else if (old_caps == new_caps) {
5149 ldout(cct, 10) << " caps unchanged at " << ccap_string(old_caps) << dendl;
5150 } else {
5151 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
5152 cap->issued = new_caps;
5153 cap->implemented |= new_caps;
5154
5155 if (cap == in->auth_cap) {
5156 // non-auth MDS is revoking the newly grant caps ?
5157 for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
5158 if (it->second == cap)
5159 continue;
5160 if (it->second->implemented & ~it->second->issued & new_caps) {
5161 check = true;
5162 break;
5163 }
5164 }
5165 }
5166 }
5167
5168 if (check)
5169 check_caps(in, 0);
5170
5171 // wake up waiters
5172 if (new_caps)
5173 signal_cond_list(in->waitfor_caps);
5174
5175 // may drop inode's last ref
5176 if (deleted_inode)
5177 _try_to_trim_inode(in, true);
5178
5179 m->put();
5180}
5181
7c673cae
FG
5182int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5183{
5184 if (perms.uid() == 0)
5185 return 0;
5186
5187 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5188 int ret = _posix_acl_permission(in, perms, want);
5189 if (ret != -EAGAIN)
5190 return ret;
5191 }
5192
5193 // check permissions before doing anything else
5194 if (!in->check_mode(perms, want))
5195 return -EACCES;
5196 return 0;
5197}
5198
5199int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5200 const UserPerm& perms)
5201{
5202 int r = _getattr_for_perm(in, perms);
5203 if (r < 0)
5204 goto out;
5205
5206 r = 0;
5207 if (strncmp(name, "system.", 7) == 0) {
5208 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5209 r = -EPERM;
5210 } else {
5211 r = inode_permission(in, perms, want);
5212 }
5213out:
1adf2230 5214 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5215 return r;
5216}
5217
5218ostream& operator<<(ostream &out, const UserPerm& perm) {
5219 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5220 return out;
5221}
5222
5223int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5224 const UserPerm& perms)
5225{
181888fb 5226 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5227 int r = _getattr_for_perm(in, perms);
5228 if (r < 0)
5229 goto out;
5230
5231 if (mask & CEPH_SETATTR_SIZE) {
5232 r = inode_permission(in, perms, MAY_WRITE);
5233 if (r < 0)
5234 goto out;
5235 }
5236
5237 r = -EPERM;
5238 if (mask & CEPH_SETATTR_UID) {
5239 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5240 goto out;
5241 }
5242 if (mask & CEPH_SETATTR_GID) {
5243 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5244 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5245 goto out;
5246 }
5247
5248 if (mask & CEPH_SETATTR_MODE) {
5249 if (perms.uid() != 0 && perms.uid() != in->uid)
5250 goto out;
5251
5252 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5253 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5254 stx->stx_mode &= ~S_ISGID;
5255 }
5256
5257 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5258 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5259 if (perms.uid() != 0 && perms.uid() != in->uid) {
5260 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5261 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5262 check_mask |= CEPH_SETATTR_MTIME;
5263 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5264 check_mask |= CEPH_SETATTR_ATIME;
5265 if (check_mask & mask) {
5266 goto out;
5267 } else {
5268 r = inode_permission(in, perms, MAY_WRITE);
5269 if (r < 0)
5270 goto out;
5271 }
5272 }
5273 }
5274 r = 0;
5275out:
5276 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5277 return r;
5278}
5279
5280int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5281{
181888fb 5282 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5283 unsigned want = 0;
5284
5285 if ((flags & O_ACCMODE) == O_WRONLY)
5286 want = MAY_WRITE;
5287 else if ((flags & O_ACCMODE) == O_RDWR)
5288 want = MAY_READ | MAY_WRITE;
5289 else if ((flags & O_ACCMODE) == O_RDONLY)
5290 want = MAY_READ;
5291 if (flags & O_TRUNC)
5292 want |= MAY_WRITE;
5293
5294 int r = 0;
5295 switch (in->mode & S_IFMT) {
5296 case S_IFLNK:
5297 r = -ELOOP;
5298 goto out;
5299 case S_IFDIR:
5300 if (want & MAY_WRITE) {
5301 r = -EISDIR;
5302 goto out;
5303 }
5304 break;
5305 }
5306
5307 r = _getattr_for_perm(in, perms);
5308 if (r < 0)
5309 goto out;
5310
5311 r = inode_permission(in, perms, want);
5312out:
5313 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5314 return r;
5315}
5316
5317int Client::may_lookup(Inode *dir, const UserPerm& perms)
5318{
181888fb 5319 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5320 int r = _getattr_for_perm(dir, perms);
5321 if (r < 0)
5322 goto out;
5323
5324 r = inode_permission(dir, perms, MAY_EXEC);
5325out:
5326 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5327 return r;
5328}
5329
5330int Client::may_create(Inode *dir, const UserPerm& perms)
5331{
181888fb 5332 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5333 int r = _getattr_for_perm(dir, perms);
5334 if (r < 0)
5335 goto out;
5336
5337 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5338out:
5339 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5340 return r;
5341}
5342
5343int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5344{
181888fb 5345 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5346 int r = _getattr_for_perm(dir, perms);
5347 if (r < 0)
5348 goto out;
5349
5350 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5351 if (r < 0)
5352 goto out;
5353
5354 /* 'name == NULL' means rmsnap */
5355 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5356 InodeRef otherin;
5357 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5358 if (r < 0)
5359 goto out;
5360 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5361 r = -EPERM;
5362 }
5363out:
5364 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5365 return r;
5366}
5367
5368int Client::may_hardlink(Inode *in, const UserPerm& perms)
5369{
181888fb 5370 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5371 int r = _getattr_for_perm(in, perms);
5372 if (r < 0)
5373 goto out;
5374
5375 if (perms.uid() == 0 || perms.uid() == in->uid) {
5376 r = 0;
5377 goto out;
5378 }
5379
5380 r = -EPERM;
5381 if (!S_ISREG(in->mode))
5382 goto out;
5383
5384 if (in->mode & S_ISUID)
5385 goto out;
5386
5387 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5388 goto out;
5389
5390 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5391out:
5392 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5393 return r;
5394}
5395
5396int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5397{
5398 int mask = CEPH_STAT_CAP_MODE;
5399 bool force = false;
5400 if (acl_type != NO_ACL) {
5401 mask |= CEPH_STAT_CAP_XATTR;
5402 force = in->xattr_version == 0;
5403 }
5404 return _getattr(in, mask, perms, force);
5405}
5406
5407vinodeno_t Client::_get_vino(Inode *in)
5408{
5409 /* The caller must hold the client lock */
5410 return vinodeno_t(in->ino, in->snapid);
5411}
5412
5413inodeno_t Client::_get_inodeno(Inode *in)
5414{
5415 /* The caller must hold the client lock */
5416 return in->ino;
5417}
5418
5419
5420/**
5421 * Resolve an MDS spec to a list of MDS daemon GIDs.
5422 *
5423 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5424 * It may be '*' in which case it matches all GIDs.
5425 *
5426 * If no error is returned, the `targets` vector will be populated with at least
5427 * one MDS.
5428 */
5429int Client::resolve_mds(
5430 const std::string &mds_spec,
5431 std::vector<mds_gid_t> *targets)
5432{
5433 assert(fsmap);
5434 assert(targets != nullptr);
5435
5436 mds_role_t role;
5437 std::stringstream ss;
5438 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5439 if (role_r == 0) {
5440 // We got a role, resolve it to a GID
5441 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5442 << role << "'" << dendl;
5443 targets->push_back(
5444 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5445 return 0;
5446 }
5447
5448 std::string strtol_err;
5449 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5450 if (strtol_err.empty()) {
5451 // It is a possible GID
5452 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5453 if (fsmap->gid_exists(mds_gid)) {
5454 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5455 targets->push_back(mds_gid);
5456 } else {
5457 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5458 << dendl;
5459 return -ENOENT;
5460 }
5461 } else if (mds_spec == "*") {
5462 // It is a wildcard: use all MDSs
5463 const auto mds_info = fsmap->get_mds_info();
5464
5465 if (mds_info.empty()) {
5466 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5467 return -ENOENT;
5468 }
5469
5470 for (const auto i : mds_info) {
5471 targets->push_back(i.first);
5472 }
5473 } else {
5474 // It did not parse as an integer, it is not a wildcard, it must be a name
5475 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5476 if (mds_gid == 0) {
5477 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5478
5479 lderr(cct) << "FSMap: " << *fsmap << dendl;
5480
5481 return -ENOENT;
5482 } else {
5483 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5484 << "' to GID " << mds_gid << dendl;
5485 targets->push_back(mds_gid);
5486 }
5487 }
5488
5489 return 0;
5490}
5491
5492
5493/**
5494 * Authenticate with mon and establish global ID
5495 */
5496int Client::authenticate()
5497{
5498 assert(client_lock.is_locked_by_me());
5499
5500 if (monclient->is_authenticated()) {
5501 return 0;
5502 }
5503
5504 client_lock.Unlock();
5505 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5506 client_lock.Lock();
5507 if (r < 0) {
5508 return r;
5509 }
5510
5511 whoami = monclient->get_global_id();
5512 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5513
5514 return 0;
5515}
5516
5517int Client::fetch_fsmap(bool user)
5518{
5519 int r;
5520 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5521 // rather than MDSMap because no one MDSMap contains all the daemons, and
5522 // a `tell` can address any daemon.
5523 version_t fsmap_latest;
5524 do {
5525 C_SaferCond cond;
5526 monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
5527 client_lock.Unlock();
5528 r = cond.wait();
5529 client_lock.Lock();
5530 } while (r == -EAGAIN);
5531
5532 if (r < 0) {
5533 lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
5534 return r;
5535 }
5536
5537 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5538
5539 if (user) {
5540 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5541 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5542 monclient->renew_subs();
5543 wait_on_list(waiting_for_fsmap);
5544 }
5545 assert(fsmap_user);
5546 assert(fsmap_user->get_epoch() >= fsmap_latest);
5547 } else {
5548 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5549 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5550 monclient->renew_subs();
5551 wait_on_list(waiting_for_fsmap);
5552 }
5553 assert(fsmap);
5554 assert(fsmap->get_epoch() >= fsmap_latest);
5555 }
5556 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5557 << fsmap_latest << dendl;
5558 return 0;
5559}
5560
5561/**
5562 *
5563 * @mds_spec one of ID, rank, GID, "*"
5564 *
5565 */
5566int Client::mds_command(
5567 const std::string &mds_spec,
5568 const vector<string>& cmd,
5569 const bufferlist& inbl,
5570 bufferlist *outbl,
5571 string *outs,
5572 Context *onfinish)
5573{
5574 Mutex::Locker lock(client_lock);
5575
181888fb
FG
5576 if (!initialized)
5577 return -ENOTCONN;
7c673cae
FG
5578
5579 int r;
5580 r = authenticate();
5581 if (r < 0) {
5582 return r;
5583 }
5584
5585 r = fetch_fsmap(false);
5586 if (r < 0) {
5587 return r;
5588 }
5589
5590 // Look up MDS target(s) of the command
5591 std::vector<mds_gid_t> targets;
5592 r = resolve_mds(mds_spec, &targets);
5593 if (r < 0) {
5594 return r;
5595 }
5596
5597 // If daemons are laggy, we won't send them commands. If all
5598 // are laggy then we fail.
5599 std::vector<mds_gid_t> non_laggy;
5600 for (const auto gid : targets) {
5601 const auto info = fsmap->get_info_gid(gid);
5602 if (!info.laggy()) {
5603 non_laggy.push_back(gid);
5604 }
5605 }
5606 if (non_laggy.size() == 0) {
5607 *outs = "All targeted MDS daemons are laggy";
5608 return -ENOENT;
5609 }
5610
5611 if (metadata.empty()) {
5612 // We are called on an unmounted client, so metadata
5613 // won't be initialized yet.
5614 populate_metadata("");
5615 }
5616
5617 // Send commands to targets
5618 C_GatherBuilder gather(cct, onfinish);
5619 for (const auto target_gid : non_laggy) {
5620 const auto info = fsmap->get_info_gid(target_gid);
5621
5622 // Open a connection to the target MDS
5623 entity_inst_t inst = info.get_inst();
5624 ConnectionRef conn = messenger->get_connection(inst);
5625
5626 // Generate MDSCommandOp state
5627 auto &op = command_table.start_command();
5628
5629 op.on_finish = gather.new_sub();
5630 op.cmd = cmd;
5631 op.outbl = outbl;
5632 op.outs = outs;
5633 op.inbl = inbl;
5634 op.mds_gid = target_gid;
5635 op.con = conn;
5636
5637 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5638 << " tid=" << op.tid << cmd << dendl;
5639
5640 // Construct and send MCommand
5641 MCommand *m = op.get_message(monclient->get_fsid());
5642 conn->send_message(m);
5643 }
5644 gather.activate();
5645
5646 return 0;
5647}
5648
5649void Client::handle_command_reply(MCommandReply *m)
5650{
5651 ceph_tid_t const tid = m->get_tid();
5652
5653 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5654
5655 if (!command_table.exists(tid)) {
5656 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5657 m->put();
5658 return;
5659 }
5660
5661 auto &op = command_table.get_command(tid);
5662 if (op.outbl) {
5663 op.outbl->claim(m->get_data());
5664 }
5665 if (op.outs) {
5666 *op.outs = m->rs;
5667 }
5668
5669 if (op.on_finish) {
5670 op.on_finish->complete(m->r);
5671 }
5672
5673 command_table.erase(tid);
5674
5675 m->put();
5676}
5677
5678// -------------------
5679// MOUNT
5680
5681int Client::mount(const std::string &mount_root, const UserPerm& perms,
5682 bool require_mds)
5683{
5684 Mutex::Locker lock(client_lock);
5685
5686 if (mounted) {
5687 ldout(cct, 5) << "already mounted" << dendl;
5688 return 0;
5689 }
5690
b32b8144
FG
5691 unmounting = false;
5692
7c673cae
FG
5693 int r = authenticate();
5694 if (r < 0) {
5695 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5696 return r;
5697 }
5698
5699 std::string want = "mdsmap";
5700 const auto &mds_ns = cct->_conf->client_mds_namespace;
5701 if (!mds_ns.empty()) {
5702 r = fetch_fsmap(true);
5703 if (r < 0)
5704 return r;
5705 fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
5706 if (cid == FS_CLUSTER_ID_NONE)
5707 return -ENOENT;
5708
5709 std::ostringstream oss;
5710 oss << want << "." << cid;
5711 want = oss.str();
5712 }
5713 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5714
5715 monclient->sub_want(want, 0, 0);
5716 monclient->renew_subs();
5717
5718 tick(); // start tick
5719
5720 if (require_mds) {
5721 while (1) {
5722 auto availability = mdsmap->is_cluster_available();
5723 if (availability == MDSMap::STUCK_UNAVAILABLE) {
5724 // Error out
5725 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
5726 return CEPH_FUSE_NO_MDS_UP;
5727 } else if (availability == MDSMap::AVAILABLE) {
5728 // Continue to mount
5729 break;
5730 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
5731 // Else, wait. MDSMonitor will update the map to bring
5732 // us to a conclusion eventually.
5733 wait_on_list(waiting_for_mdsmap);
5734 } else {
5735 // Unexpected value!
5736 ceph_abort();
5737 }
5738 }
5739 }
5740
5741 populate_metadata(mount_root.empty() ? "/" : mount_root);
5742
5743 filepath fp(CEPH_INO_ROOT);
5744 if (!mount_root.empty()) {
5745 fp = filepath(mount_root.c_str());
5746 }
5747 while (true) {
5748 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
5749 req->set_filepath(fp);
5750 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
5751 int res = make_request(req, perms);
5752 if (res < 0) {
5753 if (res == -EACCES && root) {
5754 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
5755 break;
5756 }
5757 return res;
5758 }
5759
5760 if (fp.depth())
5761 fp.pop_dentry();
5762 else
5763 break;
5764 }
5765
5766 assert(root);
5767 _ll_get(root);
5768
5769 mounted = true;
5770
5771 // trace?
5772 if (!cct->_conf->client_trace.empty()) {
5773 traceout.open(cct->_conf->client_trace.c_str());
5774 if (traceout.is_open()) {
5775 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
5776 } else {
5777 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
5778 }
5779 }
5780
5781 /*
5782 ldout(cct, 3) << "op: // client trace data structs" << dendl;
5783 ldout(cct, 3) << "op: struct stat st;" << dendl;
5784 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
5785 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
5786 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
5787 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
5788 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
5789 ldout(cct, 3) << "op: int fd;" << dendl;
5790 */
5791 return 0;
5792}
5793
5794// UNMOUNT
5795
5796void Client::_close_sessions()
5797{
5798 while (!mds_sessions.empty()) {
5799 // send session closes!
5800 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5801 p != mds_sessions.end();
5802 ++p) {
5803 if (p->second->state != MetaSession::STATE_CLOSING) {
5804 _close_mds_session(p->second);
5805 }
5806 }
5807
5808 // wait for sessions to close
5809 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5810 mount_cond.Wait(client_lock);
5811 }
5812}
5813
31f18b77
FG
5814void Client::flush_mdlog_sync()
5815{
5816 if (mds_requests.empty())
5817 return;
5818 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5819 p != mds_sessions.end();
5820 ++p) {
5821 MetaSession *s = p->second;
5822 flush_mdlog(s);
5823 }
5824}
5825
5826void Client::flush_mdlog(MetaSession *session)
5827{
5828 // Only send this to Luminous or newer MDS daemons, older daemons
5829 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5830 const uint64_t features = session->con->get_features();
5831 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5832 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5833 session->con->send_message(m);
5834 }
5835}
5836
5837
b32b8144 5838void Client::_unmount()
7c673cae 5839{
181888fb
FG
5840 if (unmounting)
5841 return;
7c673cae
FG
5842
5843 ldout(cct, 2) << "unmounting" << dendl;
5844 unmounting = true;
5845
b32b8144
FG
5846 deleg_timeout = 0;
5847
31f18b77 5848 flush_mdlog_sync(); // flush the mdlog for pending requests, if any
7c673cae
FG
5849 while (!mds_requests.empty()) {
5850 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
5851 mount_cond.Wait(client_lock);
5852 }
5853
5854 if (tick_event)
5855 timer.cancel_event(tick_event);
5856 tick_event = 0;
5857
5858 cwd.reset();
5859
5860 // clean up any unclosed files
5861 while (!fd_map.empty()) {
5862 Fh *fh = fd_map.begin()->second;
5863 fd_map.erase(fd_map.begin());
5864 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
5865 _release_fh(fh);
5866 }
5867
5868 while (!ll_unclosed_fh_set.empty()) {
5869 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
5870 Fh *fh = *it;
5871 ll_unclosed_fh_set.erase(fh);
5872 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
5873 _release_fh(fh);
5874 }
5875
5876 while (!opened_dirs.empty()) {
5877 dir_result_t *dirp = *opened_dirs.begin();
5878 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
5879 _closedir(dirp);
5880 }
5881
5882 _ll_drop_pins();
5883
31f18b77
FG
5884 if (blacklisted) {
5885 ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;
5886
5887 if (cct->_conf->client_oc) {
5888 // Purge all cached data so that ObjectCacher doesn't get hung up
5889 // trying to flush it. ObjectCacher's behaviour on EBLACKLISTED
5890 // is to just leave things marked dirty
5891 // (http://tracker.ceph.com/issues/9105)
5892 for (const auto &i : inode_map) {
5893 objectcacher->purge_set(&(i.second->oset));
5894 }
5895 }
5896
5897 mounted = false;
5898 return;
5899 }
5900
7c673cae
FG
5901 while (unsafe_sync_write > 0) {
5902 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
5903 mount_cond.Wait(client_lock);
5904 }
5905
5906 if (cct->_conf->client_oc) {
5907 // flush/release all buffered data
5908 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
5909 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
5910 p != inode_map.end();
5911 p = next) {
5912 next = p;
5913 ++next;
5914 Inode *in = p->second;
5915 if (!in) {
5916 ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
5917 assert(in);
5918 }
5919 if (!in->caps.empty()) {
5920 InodeRef tmp_ref(in);
5921 _release(in);
5922 _flush(in, new C_Client_FlushComplete(this, in));
5923 }
5924 }
5925 }
5926
5927 flush_caps_sync();
5928 wait_sync_caps(last_flush_tid);
5929
5930 // empty lru cache
7c673cae
FG
5931 trim_cache();
5932
5933 while (lru.lru_get_size() > 0 ||
5934 !inode_map.empty()) {
5935 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
5936 << "+" << inode_map.size() << " items"
5937 << ", waiting (for caps to release?)"
5938 << dendl;
5939 utime_t until = ceph_clock_now() + utime_t(5, 0);
5940 int r = mount_cond.WaitUntil(client_lock, until);
5941 if (r == ETIMEDOUT) {
5942 dump_cache(NULL);
5943 }
5944 }
5945 assert(lru.lru_get_size() == 0);
5946 assert(inode_map.empty());
5947
5948 // stop tracing
5949 if (!cct->_conf->client_trace.empty()) {
5950 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
5951 traceout.close();
5952 }
5953
5954 _close_sessions();
5955
5956 mounted = false;
5957
5958 ldout(cct, 2) << "unmounted." << dendl;
5959}
5960
b32b8144
FG
5961void Client::unmount()
5962{
5963 Mutex::Locker lock(client_lock);
5964 _unmount();
5965}
5966
7c673cae
FG
5967void Client::flush_cap_releases()
5968{
5969 // send any cap releases
5970 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5971 p != mds_sessions.end();
5972 ++p) {
5973 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
5974 p->first)) {
5975 if (cct->_conf->client_inject_release_failure) {
5976 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
5977 p->second->release->put();
5978 } else {
5979 p->second->con->send_message(p->second->release);
5980 }
5981 p->second->release = 0;
5982 }
5983 }
5984}
5985
5986void Client::tick()
5987{
5988 if (cct->_conf->client_debug_inject_tick_delay > 0) {
5989 sleep(cct->_conf->client_debug_inject_tick_delay);
5990 assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
5991 cct->_conf->apply_changes(NULL);
5992 }
5993
5994 ldout(cct, 21) << "tick" << dendl;
3efd9988
FG
5995 tick_event = timer.add_event_after(
5996 cct->_conf->client_tick_interval,
5997 new FunctionContext([this](int) {
5998 // Called back via Timer, which takes client_lock for us
5999 assert(client_lock.is_locked_by_me());
6000 tick();
6001 }));
7c673cae
FG
6002 utime_t now = ceph_clock_now();
6003
6004 if (!mounted && !mds_requests.empty()) {
6005 MetaRequest *req = mds_requests.begin()->second;
6006 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6007 req->abort(-ETIMEDOUT);
6008 if (req->caller_cond) {
6009 req->kick = true;
6010 req->caller_cond->Signal();
6011 }
6012 signal_cond_list(waiting_for_mdsmap);
6013 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6014 p != mds_sessions.end();
6015 ++p)
6016 signal_context_list(p->second->waiting_for_open);
6017 }
6018 }
6019
6020 if (mdsmap->get_epoch()) {
6021 // renew caps?
6022 utime_t el = now - last_cap_renew;
6023 if (el > mdsmap->get_session_timeout() / 3.0)
6024 renew_caps();
6025
6026 flush_cap_releases();
6027 }
6028
6029 // delayed caps
28e407b8 6030 xlist<Inode*>::iterator p = delayed_list.begin();
7c673cae
FG
6031 while (!p.end()) {
6032 Inode *in = *p;
6033 ++p;
6034 if (in->hold_caps_until > now)
6035 break;
28e407b8 6036 delayed_list.pop_front();
7c673cae
FG
6037 check_caps(in, CHECK_CAPS_NODELAY);
6038 }
6039
6040 trim_cache(true);
6041}
6042
6043void Client::renew_caps()
6044{
6045 ldout(cct, 10) << "renew_caps()" << dendl;
6046 last_cap_renew = ceph_clock_now();
6047
6048 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6049 p != mds_sessions.end();
6050 ++p) {
6051 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6052 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6053 renew_caps(p->second);
6054 }
6055}
6056
6057void Client::renew_caps(MetaSession *session)
6058{
6059 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6060 session->last_cap_renew_request = ceph_clock_now();
6061 uint64_t seq = ++session->cap_renew_seq;
6062 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6063}
6064
6065
6066// ===============================================================
6067// high level (POSIXy) interface
6068
6069int Client::_do_lookup(Inode *dir, const string& name, int mask,
6070 InodeRef *target, const UserPerm& perms)
6071{
6072 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6073 MetaRequest *req = new MetaRequest(op);
6074 filepath path;
6075 dir->make_nosnap_relative_path(path);
6076 path.push_dentry(name);
6077 req->set_filepath(path);
6078 req->set_inode(dir);
6079 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6080 mask |= DEBUG_GETATTR_CAPS;
6081 req->head.args.getattr.mask = mask;
6082
6083 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6084
6085 int r = make_request(req, perms, target);
6086 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6087 return r;
6088}
6089
6090int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6091 const UserPerm& perms)
6092{
6093 int r = 0;
6094 Dentry *dn = NULL;
6095
6096 if (!dir->is_dir()) {
6097 r = -ENOTDIR;
6098 goto done;
6099 }
6100
6101 if (dname == "..") {
6102 if (dir->dn_set.empty())
6103 *target = dir;
6104 else
6105 *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
6106 goto done;
6107 }
6108
6109 if (dname == ".") {
6110 *target = dir;
6111 goto done;
6112 }
6113
6114 if (dname.length() > NAME_MAX) {
6115 r = -ENAMETOOLONG;
6116 goto done;
6117 }
6118
6119 if (dname == cct->_conf->client_snapdir &&
6120 dir->snapid == CEPH_NOSNAP) {
6121 *target = open_snapdir(dir);
6122 goto done;
6123 }
6124
6125 if (dir->dir &&
6126 dir->dir->dentries.count(dname)) {
6127 dn = dir->dir->dentries[dname];
6128
6129 ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
6130 << " seq " << dn->lease_seq
6131 << dendl;
6132
94b18763 6133 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
7c673cae
FG
6134 // is dn lease valid?
6135 utime_t now = ceph_clock_now();
6136 if (dn->lease_mds >= 0 &&
6137 dn->lease_ttl > now &&
6138 mds_sessions.count(dn->lease_mds)) {
6139 MetaSession *s = mds_sessions[dn->lease_mds];
6140 if (s->cap_ttl > now &&
6141 s->cap_gen == dn->lease_gen) {
6142 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6143 // make trim_caps() behave.
6144 dir->try_touch_cap(dn->lease_mds);
6145 goto hit_dn;
6146 }
6147 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
6148 << " vs lease_gen " << dn->lease_gen << dendl;
6149 }
6150 // dir lease?
94b18763 6151 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
7c673cae 6152 if (dn->cap_shared_gen == dir->shared_gen &&
94b18763 6153 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
7c673cae
FG
6154 goto hit_dn;
6155 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6156 ldout(cct, 10) << "_lookup concluded ENOENT locally for "
6157 << *dir << " dn '" << dname << "'" << dendl;
6158 return -ENOENT;
6159 }
6160 }
6161 } else {
6162 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6163 }
6164 } else {
6165 // can we conclude ENOENT locally?
94b18763 6166 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
7c673cae
FG
6167 (dir->flags & I_COMPLETE)) {
6168 ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6169 return -ENOENT;
6170 }
6171 }
6172
6173 r = _do_lookup(dir, dname, mask, target, perms);
6174 goto done;
6175
6176 hit_dn:
6177 if (dn->inode) {
6178 *target = dn->inode;
6179 } else {
6180 r = -ENOENT;
6181 }
6182 touch_dn(dn);
6183
6184 done:
6185 if (r < 0)
6186 ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
6187 else
6188 ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
6189 return r;
6190}
6191
6192int Client::get_or_create(Inode *dir, const char* name,
6193 Dentry **pdn, bool expect_null)
6194{
6195 // lookup
6196 ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
6197 dir->open_dir();
6198 if (dir->dir->dentries.count(name)) {
6199 Dentry *dn = dir->dir->dentries[name];
6200
6201 // is dn lease valid?
6202 utime_t now = ceph_clock_now();
6203 if (dn->inode &&
6204 dn->lease_mds >= 0 &&
6205 dn->lease_ttl > now &&
6206 mds_sessions.count(dn->lease_mds)) {
6207 MetaSession *s = mds_sessions[dn->lease_mds];
6208 if (s->cap_ttl > now &&
6209 s->cap_gen == dn->lease_gen) {
6210 if (expect_null)
6211 return -EEXIST;
6212 }
6213 }
6214 *pdn = dn;
6215 } else {
6216 // otherwise link up a new one
6217 *pdn = link(dir->dir, name, NULL, NULL);
6218 }
6219
6220 // success
6221 return 0;
6222}
6223
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  // Resolve origpath component by component.  Absolute paths start at the
  // root inode, relative ones at the cwd.  On success *end holds the final
  // inode.  'mask' requests extra caps on the last component only;
  // 'followsym' controls whether a *trailing* symlink is followed
  // (intermediate "directory" symlinks are always followed).
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  // total symlinks traversed across the whole walk; bounded by MAXSYMLINKS
  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // need search permission on every directory we descend through
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;  // restart indexing on the rewritten path
	if (next->symlink[0] == '/') {
	  cur = root;  // absolute target: re-anchor the walk at the root
	}
	continue;  // note: relative target keeps cur at the parent dir
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  // cur can only be null here if a lookup yielded no inode
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6305
6306
6307// namespace ops
6308
6309int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6310{
6311 Mutex::Locker lock(client_lock);
6312 tout(cct) << "link" << std::endl;
6313 tout(cct) << relexisting << std::endl;
6314 tout(cct) << relpath << std::endl;
6315
181888fb
FG
6316 if (unmounting)
6317 return -ENOTCONN;
6318
7c673cae
FG
6319 filepath existing(relexisting);
6320
6321 InodeRef in, dir;
6322 int r = path_walk(existing, &in, perm, true);
6323 if (r < 0)
6324 return r;
6325 if (std::string(relpath) == "/") {
6326 r = -EEXIST;
6327 return r;
6328 }
6329 filepath path(relpath);
6330 string name = path.last_dentry();
6331 path.pop_dentry();
6332
6333 r = path_walk(path, &dir, perm, true);
6334 if (r < 0)
6335 return r;
6336 if (cct->_conf->client_permissions) {
6337 if (S_ISDIR(in->mode)) {
6338 r = -EPERM;
6339 return r;
6340 }
6341 r = may_hardlink(in.get(), perm);
6342 if (r < 0)
6343 return r;
6344 r = may_create(dir.get(), perm);
6345 if (r < 0)
6346 return r;
6347 }
6348 r = _link(in.get(), dir.get(), name.c_str(), perm);
6349 return r;
6350}
6351
6352int Client::unlink(const char *relpath, const UserPerm& perm)
6353{
6354 Mutex::Locker lock(client_lock);
6355 tout(cct) << "unlink" << std::endl;
6356 tout(cct) << relpath << std::endl;
6357
181888fb
FG
6358 if (unmounting)
6359 return -ENOTCONN;
6360
7c673cae
FG
6361 if (std::string(relpath) == "/")
6362 return -EISDIR;
6363
6364 filepath path(relpath);
6365 string name = path.last_dentry();
6366 path.pop_dentry();
6367 InodeRef dir;
6368 int r = path_walk(path, &dir, perm);
6369 if (r < 0)
6370 return r;
6371 if (cct->_conf->client_permissions) {
6372 r = may_delete(dir.get(), name.c_str(), perm);
6373 if (r < 0)
6374 return r;
6375 }
6376 return _unlink(dir.get(), name.c_str(), perm);
6377}
6378
6379int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6380{
6381 Mutex::Locker lock(client_lock);
6382 tout(cct) << "rename" << std::endl;
6383 tout(cct) << relfrom << std::endl;
6384 tout(cct) << relto << std::endl;
6385
181888fb
FG
6386 if (unmounting)
6387 return -ENOTCONN;
6388
7c673cae
FG
6389 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6390 return -EBUSY;
6391
6392 filepath from(relfrom);
6393 filepath to(relto);
6394 string fromname = from.last_dentry();
6395 from.pop_dentry();
6396 string toname = to.last_dentry();
6397 to.pop_dentry();
6398
6399 InodeRef fromdir, todir;
6400 int r = path_walk(from, &fromdir, perm);
6401 if (r < 0)
6402 goto out;
6403 r = path_walk(to, &todir, perm);
6404 if (r < 0)
6405 goto out;
6406
6407 if (cct->_conf->client_permissions) {
6408 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6409 if (r < 0)
6410 return r;
6411 r = may_delete(todir.get(), toname.c_str(), perm);
6412 if (r < 0 && r != -ENOENT)
6413 return r;
6414 }
6415 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6416out:
6417 return r;
6418}
6419
6420// dirs
6421
6422int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6423{
6424 Mutex::Locker lock(client_lock);
6425 tout(cct) << "mkdir" << std::endl;
6426 tout(cct) << relpath << std::endl;
6427 tout(cct) << mode << std::endl;
6428 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6429
181888fb
FG
6430 if (unmounting)
6431 return -ENOTCONN;
6432
7c673cae
FG
6433 if (std::string(relpath) == "/")
6434 return -EEXIST;
6435
6436 filepath path(relpath);
6437 string name = path.last_dentry();
6438 path.pop_dentry();
6439 InodeRef dir;
6440 int r = path_walk(path, &dir, perm);
6441 if (r < 0)
6442 return r;
6443 if (cct->_conf->client_permissions) {
6444 r = may_create(dir.get(), perm);
6445 if (r < 0)
6446 return r;
6447 }
6448 return _mkdir(dir.get(), name.c_str(), mode, perm);
6449}
6450
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  // "mkdir -p": walk as far as the path already exists, then create every
  // remaining component.  Returns -EEXIST when the full path is already
  // present, 0 once all missing components have been created.
  // NOTE(review): the walk always starts at cwd; unlike path_walk() it
  // does not re-anchor absolute paths at the root — confirm callers only
  // pass cwd-relative paths (or that cwd == root) before relying on this
  // with absolute ones.
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;
  // only a missing component is expected here; any other error aborts
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // a racing client may have created an intermediate component between
    // our lookup and _mkdir; treat -EEXIST as success for every component
    // except the last and just look the directory up instead
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6507
6508int Client::rmdir(const char *relpath, const UserPerm& perms)
6509{
6510 Mutex::Locker lock(client_lock);
6511 tout(cct) << "rmdir" << std::endl;
6512 tout(cct) << relpath << std::endl;
6513
181888fb
FG
6514 if (unmounting)
6515 return -ENOTCONN;
6516
7c673cae
FG
6517 if (std::string(relpath) == "/")
6518 return -EBUSY;
6519
6520 filepath path(relpath);
6521 string name = path.last_dentry();
6522 path.pop_dentry();
6523 InodeRef dir;
6524 int r = path_walk(path, &dir, perms);
6525 if (r < 0)
6526 return r;
6527 if (cct->_conf->client_permissions) {
6528 int r = may_delete(dir.get(), name.c_str(), perms);
6529 if (r < 0)
6530 return r;
6531 }
6532 return _rmdir(dir.get(), name.c_str(), perms);
6533}
6534
6535int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6536{
6537 Mutex::Locker lock(client_lock);
6538 tout(cct) << "mknod" << std::endl;
6539 tout(cct) << relpath << std::endl;
6540 tout(cct) << mode << std::endl;
6541 tout(cct) << rdev << std::endl;
6542
181888fb
FG
6543 if (unmounting)
6544 return -ENOTCONN;
6545
7c673cae
FG
6546 if (std::string(relpath) == "/")
6547 return -EEXIST;
6548
6549 filepath path(relpath);
6550 string name = path.last_dentry();
6551 path.pop_dentry();
6552 InodeRef dir;
6553 int r = path_walk(path, &dir, perms);
6554 if (r < 0)
6555 return r;
6556 if (cct->_conf->client_permissions) {
6557 int r = may_create(dir.get(), perms);
6558 if (r < 0)
6559 return r;
6560 }
6561 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6562}
6563
6564// symlinks
6565
6566int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6567{
6568 Mutex::Locker lock(client_lock);
6569 tout(cct) << "symlink" << std::endl;
6570 tout(cct) << target << std::endl;
6571 tout(cct) << relpath << std::endl;
6572
181888fb
FG
6573 if (unmounting)
6574 return -ENOTCONN;
6575
7c673cae
FG
6576 if (std::string(relpath) == "/")
6577 return -EEXIST;
6578
6579 filepath path(relpath);
6580 string name = path.last_dentry();
6581 path.pop_dentry();
6582 InodeRef dir;
6583 int r = path_walk(path, &dir, perms);
6584 if (r < 0)
6585 return r;
6586 if (cct->_conf->client_permissions) {
6587 int r = may_create(dir.get(), perms);
6588 if (r < 0)
6589 return r;
6590 }
6591 return _symlink(dir.get(), name.c_str(), target, perms);
6592}
6593
6594int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6595{
6596 Mutex::Locker lock(client_lock);
6597 tout(cct) << "readlink" << std::endl;
6598 tout(cct) << relpath << std::endl;
6599
181888fb
FG
6600 if (unmounting)
6601 return -ENOTCONN;
6602
7c673cae
FG
6603 filepath path(relpath);
6604 InodeRef in;
6605 int r = path_walk(path, &in, perms, false);
6606 if (r < 0)
6607 return r;
6608
6609 return _readlink(in.get(), buf, size);
6610}
6611
6612int Client::_readlink(Inode *in, char *buf, size_t size)
6613{
6614 if (!in->is_symlink())
6615 return -EINVAL;
6616
6617 // copy into buf (at most size bytes)
6618 int r = in->symlink.length();
6619 if (r > (int)size)
6620 r = size;
6621 memcpy(buf, in->symlink.c_str(), r);
6622 return r;
6623}
6624
6625
6626// inode stuff
6627
6628int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6629{
94b18763 6630 bool yes = in->caps_issued_mask(mask, true);
7c673cae
FG
6631
6632 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6633 if (yes && !force)
6634 return 0;
6635
6636 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6637 filepath path;
6638 in->make_nosnap_relative_path(path);
6639 req->set_filepath(path);
6640 req->set_inode(in);
6641 req->head.args.getattr.mask = mask;
6642
6643 int res = make_request(req, perms);
6644 ldout(cct, 10) << "_getattr result=" << res << dendl;
6645 return res;
6646}
6647
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  // Apply the attribute changes selected by 'mask' (values in 'stx') to
  // 'in'.  Bits that can be satisfied purely in the local cache (because
  // this client holds the matching exclusive caps) are cleared from
  // 'mask'; whatever remains is sent to the MDS as a SETATTR request.
  // Returns 0 on success or a negative errno.
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must fit within the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold so the ctime gets flushed;
    // with none held, fall back to a sync CTIME setattr
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // with Ax we can update ownership/mode/btime locally
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only the permission bits change; the file-type bits are preserved
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    // with Fx, time updates can also be applied locally
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
	in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
	in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask) {
    // everything was handled locally; bump change_attr and we're done
    in->change_attr++;
    return 0;
  }

force_request:
  // remaining bits need a synchronous MDS request
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  // for each attribute sent, drop the caps that would let us (or others)
  // cache a now-stale copy of it
  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    // reject truncates beyond the cluster-wide max file size
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6838
6839/* Note that we only care about attrs that setattr cares about */
6840void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6841{
6842 stx->stx_size = st->st_size;
6843 stx->stx_mode = st->st_mode;
6844 stx->stx_uid = st->st_uid;
6845 stx->stx_gid = st->st_gid;
6846 stx->stx_mtime = st->st_mtim;
6847 stx->stx_atime = st->st_atim;
6848}
6849
6850int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6851 const UserPerm& perms, InodeRef *inp)
6852{
6853 int ret = _do_setattr(in, stx, mask, perms, inp);
6854 if (ret < 0)
6855 return ret;
6856 if (mask & CEPH_SETATTR_MODE)
6857 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6858 return ret;
6859}
6860
6861int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6862 const UserPerm& perms)
6863{
6864 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6865 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6866 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6867 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6868 if (cct->_conf->client_permissions) {
6869 int r = may_setattr(in.get(), stx, mask, perms);
6870 if (r < 0)
6871 return r;
6872 }
6873 return __setattrx(in.get(), stx, mask, perms);
6874}
6875
6876int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6877 const UserPerm& perms)
6878{
6879 struct ceph_statx stx;
6880
6881 stat_to_statx(attr, &stx);
6882 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
6883
6884 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6885 mask &= ~CEPH_SETATTR_UID;
6886 }
6887 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6888 mask &= ~CEPH_SETATTR_GID;
6889 }
6890
7c673cae
FG
6891 return _setattrx(in, &stx, mask, perms);
6892}
6893
6894int Client::setattr(const char *relpath, struct stat *attr, int mask,
6895 const UserPerm& perms)
6896{
6897 Mutex::Locker lock(client_lock);
6898 tout(cct) << "setattr" << std::endl;
6899 tout(cct) << relpath << std::endl;
6900 tout(cct) << mask << std::endl;
6901
181888fb
FG
6902 if (unmounting)
6903 return -ENOTCONN;
6904
7c673cae
FG
6905 filepath path(relpath);
6906 InodeRef in;
6907 int r = path_walk(path, &in, perms);
6908 if (r < 0)
6909 return r;
6910 return _setattr(in, attr, mask, perms);
6911}
6912
6913int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6914 const UserPerm& perms, int flags)
6915{
6916 Mutex::Locker lock(client_lock);
6917 tout(cct) << "setattrx" << std::endl;
6918 tout(cct) << relpath << std::endl;
6919 tout(cct) << mask << std::endl;
6920
181888fb
FG
6921 if (unmounting)
6922 return -ENOTCONN;
6923
7c673cae
FG
6924 filepath path(relpath);
6925 InodeRef in;
6926 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6927 if (r < 0)
6928 return r;
6929 return _setattrx(in, stx, mask, perms);
6930}
6931
6932int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6933{
6934 Mutex::Locker lock(client_lock);
6935 tout(cct) << "fsetattr" << std::endl;
6936 tout(cct) << fd << std::endl;
6937 tout(cct) << mask << std::endl;
6938
181888fb
FG
6939 if (unmounting)
6940 return -ENOTCONN;
6941
7c673cae
FG
6942 Fh *f = get_filehandle(fd);
6943 if (!f)
6944 return -EBADF;
6945#if defined(__linux__) && defined(O_PATH)
6946 if (f->flags & O_PATH)
6947 return -EBADF;
6948#endif
6949 return _setattr(f->inode, attr, mask, perms);
6950}
6951
6952int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6953{
6954 Mutex::Locker lock(client_lock);
6955 tout(cct) << "fsetattr" << std::endl;
6956 tout(cct) << fd << std::endl;
6957 tout(cct) << mask << std::endl;
6958
181888fb
FG
6959 if (unmounting)
6960 return -ENOTCONN;
6961
7c673cae
FG
6962 Fh *f = get_filehandle(fd);
6963 if (!f)
6964 return -EBADF;
6965#if defined(__linux__) && defined(O_PATH)
6966 if (f->flags & O_PATH)
6967 return -EBADF;
6968#endif
6969 return _setattrx(f->inode, stx, mask, perms);
6970}
6971
6972int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
6973 frag_info_t *dirstat, int mask)
6974{
6975 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6976 Mutex::Locker lock(client_lock);
6977 tout(cct) << "stat" << std::endl;
6978 tout(cct) << relpath << std::endl;
181888fb
FG
6979
6980 if (unmounting)
6981 return -ENOTCONN;
6982
7c673cae
FG
6983 filepath path(relpath);
6984 InodeRef in;
6985 int r = path_walk(path, &in, perms, true, mask);
6986 if (r < 0)
6987 return r;
6988 r = _getattr(in, mask, perms);
6989 if (r < 0) {
6990 ldout(cct, 3) << "stat exit on error!" << dendl;
6991 return r;
6992 }
6993 fill_stat(in, stbuf, dirstat);
6994 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
6995 return r;
6996}
6997
6998unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
6999{
7000 unsigned mask = 0;
7001
7002 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7003 if (flags & AT_NO_ATTR_SYNC)
7004 goto out;
7005
7006 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7007 mask |= CEPH_CAP_PIN;
7008 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7009 mask |= CEPH_CAP_AUTH_SHARED;
7010 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7011 mask |= CEPH_CAP_LINK_SHARED;
7012 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7013 mask |= CEPH_CAP_FILE_SHARED;
7014 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7015 mask |= CEPH_CAP_XATTR_SHARED;
7016out:
7017 return mask;
7018}
7019
7020int Client::statx(const char *relpath, struct ceph_statx *stx,
7021 const UserPerm& perms,
7022 unsigned int want, unsigned int flags)
7023{
7024 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
7025 Mutex::Locker lock(client_lock);
7026 tout(cct) << "statx" << std::endl;
7027 tout(cct) << relpath << std::endl;
181888fb
FG
7028
7029 if (unmounting)
7030 return -ENOTCONN;
7031
7c673cae
FG
7032 filepath path(relpath);
7033 InodeRef in;
7034
7035 unsigned mask = statx_to_mask(flags, want);
7036
7037 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7038 if (r < 0)
7039 return r;
7040
7041 r = _getattr(in, mask, perms);
7042 if (r < 0) {
7043 ldout(cct, 3) << "statx exit on error!" << dendl;
7044 return r;
7045 }
7046
7047 fill_statx(in, mask, stx);
7048 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7049 return r;
7050}
7051
7052int Client::lstat(const char *relpath, struct stat *stbuf,
7053 const UserPerm& perms, frag_info_t *dirstat, int mask)
7054{
7055 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7056 Mutex::Locker lock(client_lock);
7057 tout(cct) << "lstat" << std::endl;
7058 tout(cct) << relpath << std::endl;
181888fb
FG
7059
7060 if (unmounting)
7061 return -ENOTCONN;
7062
7c673cae
FG
7063 filepath path(relpath);
7064 InodeRef in;
7065 // don't follow symlinks
7066 int r = path_walk(path, &in, perms, false, mask);
7067 if (r < 0)
7068 return r;
7069 r = _getattr(in, mask, perms);
7070 if (r < 0) {
7071 ldout(cct, 3) << "lstat exit on error!" << dendl;
7072 return r;
7073 }
7074 fill_stat(in, stbuf, dirstat);
7075 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7076 return r;
7077}
7078
7079int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7080{
7081 ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
7082 << " mode 0" << oct << in->mode << dec
7083 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7084 memset(st, 0, sizeof(struct stat));
7085 if (use_faked_inos())
7086 st->st_ino = in->faked_ino;
7087 else
7088 st->st_ino = in->ino;
7089 st->st_dev = in->snapid;
7090 st->st_mode = in->mode;
7091 st->st_rdev = in->rdev;
28e407b8
AA
7092 if (in->is_dir()) {
7093 switch (in->nlink) {
7094 case 0:
7095 st->st_nlink = 0; /* dir is unlinked */
7096 break;
7097 case 1:
7098 st->st_nlink = 1 /* parent dentry */
7099 + 1 /* <dir>/. */
7100 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
7101 break;
7102 default:
7103 ceph_abort();
7104 }
7105 } else {
7106 st->st_nlink = in->nlink;
7107 }
7c673cae
FG
7108 st->st_uid = in->uid;
7109 st->st_gid = in->gid;
7110 if (in->ctime > in->mtime) {
7111 stat_set_ctime_sec(st, in->ctime.sec());
7112 stat_set_ctime_nsec(st, in->ctime.nsec());
7113 } else {
7114 stat_set_ctime_sec(st, in->mtime.sec());
7115 stat_set_ctime_nsec(st, in->mtime.nsec());
7116 }
7117 stat_set_atime_sec(st, in->atime.sec());
7118 stat_set_atime_nsec(st, in->atime.nsec());
7119 stat_set_mtime_sec(st, in->mtime.sec());
7120 stat_set_mtime_nsec(st, in->mtime.nsec());
7121 if (in->is_dir()) {
7122 if (cct->_conf->client_dirsize_rbytes)
7123 st->st_size = in->rstat.rbytes;
7124 else
7125 st->st_size = in->dirstat.size();
7126 st->st_blocks = 1;
7127 } else {
7128 st->st_size = in->size;
7129 st->st_blocks = (in->size + 511) >> 9;
7130 }
7131 st->st_blksize = MAX(in->layout.stripe_unit, 4096);
7132
7133 if (dirstat)
7134 *dirstat = in->dirstat;
7135 if (rstat)
7136 *rstat = in->rstat;
7137
7138 return in->caps_issued();
7139}
7140
7141void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7142{
7143 ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
7144 << " mode 0" << oct << in->mode << dec
7145 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7146 memset(stx, 0, sizeof(struct ceph_statx));
7147
7148 /*
7149 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7150 * so that all bits are set.
7151 */
7152 if (!mask)
7153 mask = ~0;
7154
7155 /* These are always considered to be available */
7156 stx->stx_dev = in->snapid;
7157 stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);
7158
7159 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7160 stx->stx_mode = S_IFMT & in->mode;
7161 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7162 stx->stx_rdev = in->rdev;
7163 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7164
7165 if (mask & CEPH_CAP_AUTH_SHARED) {
7166 stx->stx_uid = in->uid;
7167 stx->stx_gid = in->gid;
7168 stx->stx_mode = in->mode;
7169 in->btime.to_timespec(&stx->stx_btime);
7170 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7171 }
7172
7173 if (mask & CEPH_CAP_LINK_SHARED) {
28e407b8
AA
7174 if (in->is_dir()) {
7175 switch (in->nlink) {
7176 case 0:
7177 stx->stx_nlink = 0; /* dir is unlinked */
7178 break;
7179 case 1:
7180 stx->stx_nlink = 1 /* parent dentry */
7181 + 1 /* <dir>/. */
7182 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
7183 break;
7184 default:
7185 ceph_abort();
7186 }
7187 } else {
7188 stx->stx_nlink = in->nlink;
7189 }
7c673cae
FG
7190 stx->stx_mask |= CEPH_STATX_NLINK;
7191 }
7192
7193 if (mask & CEPH_CAP_FILE_SHARED) {
7194
7195 in->atime.to_timespec(&stx->stx_atime);
7196 in->mtime.to_timespec(&stx->stx_mtime);
7197
7198 if (in->is_dir()) {
7199 if (cct->_conf->client_dirsize_rbytes)
7200 stx->stx_size = in->rstat.rbytes;
7201 else
7202 stx->stx_size = in->dirstat.size();
7203 stx->stx_blocks = 1;
7204 } else {
7205 stx->stx_size = in->size;
7206 stx->stx_blocks = (in->size + 511) >> 9;
7207 }
7208 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7209 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7210 }
7211
7212 /* Change time and change_attr both require all shared caps to view */
7213 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7214 stx->stx_version = in->change_attr;
7215 if (in->ctime > in->mtime)
7216 in->ctime.to_timespec(&stx->stx_ctime);
7217 else
7218 in->mtime.to_timespec(&stx->stx_ctime);
7219 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7220 }
7221
7222}
7223
7224void Client::touch_dn(Dentry *dn)
7225{
7226 lru.lru_touch(dn);
7227}
7228
7229int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7230{
7231 Mutex::Locker lock(client_lock);
7232 tout(cct) << "chmod" << std::endl;
7233 tout(cct) << relpath << std::endl;
7234 tout(cct) << mode << std::endl;
181888fb
FG
7235
7236 if (unmounting)
7237 return -ENOTCONN;
7238
7c673cae
FG
7239 filepath path(relpath);
7240 InodeRef in;
7241 int r = path_walk(path, &in, perms);
7242 if (r < 0)
7243 return r;
7244 struct stat attr;
7245 attr.st_mode = mode;
7246 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7247}
7248
7249int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7250{
7251 Mutex::Locker lock(client_lock);
7252 tout(cct) << "fchmod" << std::endl;
7253 tout(cct) << fd << std::endl;
7254 tout(cct) << mode << std::endl;
181888fb
FG
7255
7256 if (unmounting)
7257 return -ENOTCONN;
7258
7c673cae
FG
7259 Fh *f = get_filehandle(fd);
7260 if (!f)
7261 return -EBADF;
7262#if defined(__linux__) && defined(O_PATH)
7263 if (f->flags & O_PATH)
7264 return -EBADF;
7265#endif
7266 struct stat attr;
7267 attr.st_mode = mode;
7268 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7269}
7270
7271int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7272{
7273 Mutex::Locker lock(client_lock);
7274 tout(cct) << "lchmod" << std::endl;
7275 tout(cct) << relpath << std::endl;
7276 tout(cct) << mode << std::endl;
181888fb
FG
7277
7278 if (unmounting)
7279 return -ENOTCONN;
7280
7c673cae
FG
7281 filepath path(relpath);
7282 InodeRef in;
7283 // don't follow symlinks
7284 int r = path_walk(path, &in, perms, false);
7285 if (r < 0)
7286 return r;
7287 struct stat attr;
7288 attr.st_mode = mode;
7289 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7290}
7291
7292int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7293 const UserPerm& perms)
7294{
7295 Mutex::Locker lock(client_lock);
7296 tout(cct) << "chown" << std::endl;
7297 tout(cct) << relpath << std::endl;
7298 tout(cct) << new_uid << std::endl;
7299 tout(cct) << new_gid << std::endl;
181888fb
FG
7300
7301 if (unmounting)
7302 return -ENOTCONN;
7303
7c673cae
FG
7304 filepath path(relpath);
7305 InodeRef in;
7306 int r = path_walk(path, &in, perms);
7307 if (r < 0)
7308 return r;
7309 struct stat attr;
7310 attr.st_uid = new_uid;
7311 attr.st_gid = new_gid;
181888fb 7312 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7313}
7314
7315int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7316{
7317 Mutex::Locker lock(client_lock);
7318 tout(cct) << "fchown" << std::endl;
7319 tout(cct) << fd << std::endl;
7320 tout(cct) << new_uid << std::endl;
7321 tout(cct) << new_gid << std::endl;
181888fb
FG
7322
7323 if (unmounting)
7324 return -ENOTCONN;
7325
7c673cae
FG
7326 Fh *f = get_filehandle(fd);
7327 if (!f)
7328 return -EBADF;
7329#if defined(__linux__) && defined(O_PATH)
7330 if (f->flags & O_PATH)
7331 return -EBADF;
7332#endif
7333 struct stat attr;
7334 attr.st_uid = new_uid;
7335 attr.st_gid = new_gid;
7336 int mask = 0;
7337 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7338 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7339 return _setattr(f->inode, &attr, mask, perms);
7340}
7341
7342int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7343 const UserPerm& perms)
7344{
7345 Mutex::Locker lock(client_lock);
7346 tout(cct) << "lchown" << std::endl;
7347 tout(cct) << relpath << std::endl;
7348 tout(cct) << new_uid << std::endl;
7349 tout(cct) << new_gid << std::endl;
181888fb
FG
7350
7351 if (unmounting)
7352 return -ENOTCONN;
7353
7c673cae
FG
7354 filepath path(relpath);
7355 InodeRef in;
7356 // don't follow symlinks
7357 int r = path_walk(path, &in, perms, false);
7358 if (r < 0)
7359 return r;
7360 struct stat attr;
7361 attr.st_uid = new_uid;
7362 attr.st_gid = new_gid;
7363 int mask = 0;
7364 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7365 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7366 return _setattr(in, &attr, mask, perms);
7367}
7368
7369int Client::utime(const char *relpath, struct utimbuf *buf,
7370 const UserPerm& perms)
7371{
7372 Mutex::Locker lock(client_lock);
7373 tout(cct) << "utime" << std::endl;
7374 tout(cct) << relpath << std::endl;
7375 tout(cct) << buf->modtime << std::endl;
7376 tout(cct) << buf->actime << std::endl;
181888fb
FG
7377
7378 if (unmounting)
7379 return -ENOTCONN;
7380
7c673cae
FG
7381 filepath path(relpath);
7382 InodeRef in;
7383 int r = path_walk(path, &in, perms);
7384 if (r < 0)
7385 return r;
7386 struct stat attr;
7387 stat_set_mtime_sec(&attr, buf->modtime);
7388 stat_set_mtime_nsec(&attr, 0);
7389 stat_set_atime_sec(&attr, buf->actime);
7390 stat_set_atime_nsec(&attr, 0);
7391 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7392}
7393
7394int Client::lutime(const char *relpath, struct utimbuf *buf,
7395 const UserPerm& perms)
7396{
7397 Mutex::Locker lock(client_lock);
7398 tout(cct) << "lutime" << std::endl;
7399 tout(cct) << relpath << std::endl;
7400 tout(cct) << buf->modtime << std::endl;
7401 tout(cct) << buf->actime << std::endl;
181888fb
FG
7402
7403 if (unmounting)
7404 return -ENOTCONN;
7405
7c673cae
FG
7406 filepath path(relpath);
7407 InodeRef in;
7408 // don't follow symlinks
7409 int r = path_walk(path, &in, perms, false);
7410 if (r < 0)
7411 return r;
7412 struct stat attr;
7413 stat_set_mtime_sec(&attr, buf->modtime);
7414 stat_set_mtime_nsec(&attr, 0);
7415 stat_set_atime_sec(&attr, buf->actime);
7416 stat_set_atime_nsec(&attr, 0);
7417 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7418}
7419
7420int Client::flock(int fd, int operation, uint64_t owner)
7421{
7422 Mutex::Locker lock(client_lock);
7423 tout(cct) << "flock" << std::endl;
7424 tout(cct) << fd << std::endl;
7425 tout(cct) << operation << std::endl;
7426 tout(cct) << owner << std::endl;
181888fb
FG
7427
7428 if (unmounting)
7429 return -ENOTCONN;
7430
7c673cae
FG
7431 Fh *f = get_filehandle(fd);
7432 if (!f)
7433 return -EBADF;
7434
7435 return _flock(f, operation, owner);
7436}
7437
7438int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7439{
7440 Mutex::Locker lock(client_lock);
7441 tout(cct) << "opendir" << std::endl;
7442 tout(cct) << relpath << std::endl;
181888fb
FG
7443
7444 if (unmounting)
7445 return -ENOTCONN;
7446
7c673cae
FG
7447 filepath path(relpath);
7448 InodeRef in;
7449 int r = path_walk(path, &in, perms, true);
7450 if (r < 0)
7451 return r;
7452 if (cct->_conf->client_permissions) {
7453 int r = may_open(in.get(), O_RDONLY, perms);
7454 if (r < 0)
7455 return r;
7456 }
7457 r = _opendir(in.get(), dirpp, perms);
7458 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7459 if (r != -ENOTDIR)
7460 tout(cct) << (unsigned long)*dirpp << std::endl;
7461 return r;
7462}
7463
7464int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7465{
7466 if (!in->is_dir())
7467 return -ENOTDIR;
7468 *dirpp = new dir_result_t(in, perms);
7469 opened_dirs.insert(*dirpp);
1adf2230 7470 ldout(cct, 8) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
7471 return 0;
7472}
7473
7474
7475int Client::closedir(dir_result_t *dir)
7476{
7477 Mutex::Locker lock(client_lock);
7478 tout(cct) << "closedir" << std::endl;
7479 tout(cct) << (unsigned long)dir << std::endl;
7480
7481 ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
7482 _closedir(dir);
7483 return 0;
7484}
7485
7486void Client::_closedir(dir_result_t *dirp)
7487{
7488 ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
7489 if (dirp->inode) {
7490 ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
7491 dirp->inode.reset();
7492 }
7493 _readdir_drop_dirp_buffer(dirp);
7494 opened_dirs.erase(dirp);
7495 delete dirp;
7496}
7497
7498void Client::rewinddir(dir_result_t *dirp)
7499{
7500 Mutex::Locker lock(client_lock);
7c673cae 7501 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
181888fb
FG
7502
7503 if (unmounting)
7504 return;
7505
7c673cae
FG
7506 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7507 _readdir_drop_dirp_buffer(d);
7508 d->reset();
7509}
7510
7511loff_t Client::telldir(dir_result_t *dirp)
7512{
7513 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7514 ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
7515 return d->offset;
7516}
7517
// Reposition a readdir handle to `offset` (a value previously returned by
// telldir()).  Depending on the seek direction and the directory's frag
// layout this may invalidate the handle's buffered dentries and its
// readdir-cache bookkeeping.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  // No-op if we're already there.
  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // Hash-ordered listing: only a backward seek invalidates the buffer.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Frag-ordered listing: drop the buffer when seeking to the start,
    // into a different frag, or backward within the current frag.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7551
7552
7553//struct dirent {
7554// ino_t d_ino; /* inode number */
7555// off_t d_off; /* offset to the next dirent */
7556// unsigned short d_reclen; /* length of this record */
7557// unsigned char d_type; /* type of file */
7558// char d_name[256]; /* filename */
7559//};
// Populate a struct dirent from name/type/ino.  `next_off` is the directory
// offset of the *next* entry (stored in d_off on platforms that have it).
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // Copy at most 255 characters and always NUL-terminate (d_name is 256 bytes).
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  // Convert S_IF* mode bits to the DT_* encoding dirent uses.
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7575
// Advance the readdir handle past its current buffered frag: either mark
// the handle at end (rightmost frag) or move its offset to the start of
// the next frag.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // Frag-ordered: restart at offset 2 of the new frag ('.' and '..'
    // occupy offsets 0 and 1) and re-check the frag against dirfragtree.
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7601
// If the directory's fragtree no longer contains the frag the handle is
// positioned on (e.g. after a split/merge), re-map to the frag that now
// covers that value and restart iteration within it.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  // Hash-ordered iteration does not track frags this way.
  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7618
7619void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7620{
7621 ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
7622 dirp->buffer.clear();
7623}
7624
// Fetch one directory fragment's worth of dentries from the MDS into the
// handle's buffer.  The reply handler fills dirp->buffer/buffer_frag via
// req->dirp.  On EAGAIN (frag moved under us) we re-choose the frag and
// retry recursively; on any other error the handle is marked at end.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  assert(dirp);
  assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
	   << " offset " << hex << dirp->offset << dec << dendl;

  // A snapdir listing is a LSSNAP op rather than a READDIR.
  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // Resume after the last name we returned from this frag.
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // Frag layout changed while the request was in flight; re-map and retry.
    ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
	     << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7679
// Heterogeneous comparator for std::lower_bound over a directory's
// readdir_cache: orders cached Dentry pointers by directory offset.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7685
// Serve a readdir from the locally cached, complete-and-ordered dentry set
// (no MDS round trip).  Invokes `cb` once per entry with client_lock
// *dropped* for the duration of the callback.  Returns -EAGAIN if the
// cache stops being complete/ordered mid-iteration (caller falls back to
// fetching frags), a negative error, the callback's positive stop value,
// or 0 at end of directory.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // Binary-search the cache for the first dentry at/after our offset.
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // The lock is dropped around cb() below, so re-validate each round.
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      // Negative dentry — not a real entry.
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // Stale dentry from an earlier cap generation.
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // NOTE: dn/pd may be invalidated while unlocked; only local copies are
    // used after this point until the top-of-loop revalidation.
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7769
// Core readdir loop: synthesizes '.' and '..', then serves entries either
// from the local cache (_readdir_cache_cb) or by fetching frags from the
// MDS (_readdir_get_frag), invoking `cb` once per entry with client_lock
// dropped around the callback.  Returns 0 at end of directory, a negative
// error, or the callback's positive stop value.  `want`/`flags` select the
// statx fields to populate; `getref` hands the callback a referenced Inode*.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
	   << dec << " at_end=" << dirp->at_end()
	   << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // Offset 0 is the synthetic '.' entry.
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // Offset 1 is the synthetic '..' entry (the dir itself when unlinked/root).
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // -EAGAIN means the cache became invalid mid-iteration; fall through to
    // the frag-fetching path below.
    if (err != -EAGAIN)
      return err;
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // Walk the buffered entries from our current offset.
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // We iterated the whole directory.  If nothing changed underneath us,
    // the cache is now complete (and, if ordered_count also matches,
    // suitably ordered for future cached readdirs).
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
7962
7963
7964int Client::readdir_r(dir_result_t *d, struct dirent *de)
7965{
7966 return readdirplus_r(d, de, 0, 0, 0, NULL);
7967}
7968
7969/*
7970 * readdirplus_r
7971 *
7972 * returns
7973 * 1 if we got a dirent
7974 * 0 for end of directory
7975 * <0 on error
7976 */
7977
7978struct single_readdir {
7979 struct dirent *de;
7980 struct ceph_statx *stx;
7981 Inode *inode;
7982 bool full;
7983};
7984
7985static int _readdir_single_dirent_cb(void *p, struct dirent *de,
7986 struct ceph_statx *stx, off_t off,
7987 Inode *in)
7988{
7989 single_readdir *c = static_cast<single_readdir *>(p);
7990
7991 if (c->full)
7992 return -1; // already filled this dirent
7993
7994 *c->de = *de;
7995 if (c->stx)
7996 *c->stx = *stx;
7997 c->inode = in;
7998 c->full = true;
7999 return 1;
8000}
8001
8002struct dirent *Client::readdir(dir_result_t *d)
8003{
8004 int ret;
8005 static struct dirent de;
8006 single_readdir sr;
8007 sr.de = &de;
8008 sr.stx = NULL;
8009 sr.inode = NULL;
8010 sr.full = false;
8011
8012 // our callback fills the dirent and sets sr.full=true on first
8013 // call, and returns -1 the second time around.
8014 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8015 if (ret < -1) {
8016 errno = -ret; // this sucks.
8017 return (dirent *) NULL;
8018 }
8019 if (sr.full) {
8020 return &de;
8021 }
8022 return (dirent *) NULL;
8023}
8024
8025int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8026 struct ceph_statx *stx, unsigned want,
8027 unsigned flags, Inode **out)
8028{
8029 single_readdir sr;
8030 sr.de = de;
8031 sr.stx = stx;
8032 sr.inode = NULL;
8033 sr.full = false;
8034
8035 // our callback fills the dirent and sets sr.full=true on first
8036 // call, and returns -1 the second time around.
8037 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8038 if (r < -1)
8039 return r;
8040 if (out)
8041 *out = sr.inode;
8042 if (sr.full)
8043 return 1;
8044 return 0;
8045}
8046
8047
8048/* getdents */
8049struct getdents_result {
8050 char *buf;
8051 int buflen;
8052 int pos;
8053 bool fullent;
8054};
8055
8056static int _readdir_getdent_cb(void *p, struct dirent *de,
8057 struct ceph_statx *stx, off_t off, Inode *in)
8058{
8059 struct getdents_result *c = static_cast<getdents_result *>(p);
8060
8061 int dlen;
8062 if (c->fullent)
8063 dlen = sizeof(*de);
8064 else
8065 dlen = strlen(de->d_name) + 1;
8066
8067 if (c->pos + dlen > c->buflen)
8068 return -1; // doesn't fit
8069
8070 if (c->fullent) {
8071 memcpy(c->buf + c->pos, de, sizeof(*de));
8072 } else {
8073 memcpy(c->buf + c->pos, de->d_name, dlen);
8074 }
8075 c->pos += dlen;
8076 return 0;
8077}
8078
8079int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8080{
8081 getdents_result gr;
8082 gr.buf = buf;
8083 gr.buflen = buflen;
8084 gr.fullent = fullent;
8085 gr.pos = 0;
8086
8087 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8088
8089 if (r < 0) { // some error
8090 if (r == -1) { // buffer ran out of space
8091 if (gr.pos) { // but we got some entries already!
8092 return gr.pos;
8093 } // or we need a larger buffer
8094 return -ERANGE;
8095 } else { // actual error, return it
8096 return r;
8097 }
8098 }
8099 return gr.pos;
8100}
8101
8102
8103/* getdir */
8104struct getdir_result {
8105 list<string> *contents;
8106 int num;
8107};
8108
8109static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8110{
8111 getdir_result *r = static_cast<getdir_result *>(p);
8112
8113 r->contents->push_back(de->d_name);
8114 r->num++;
8115 return 0;
8116}
8117
8118int Client::getdir(const char *relpath, list<string>& contents,
8119 const UserPerm& perms)
8120{
8121 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8122 {
8123 Mutex::Locker lock(client_lock);
8124 tout(cct) << "getdir" << std::endl;
8125 tout(cct) << relpath << std::endl;
8126 }
8127
8128 dir_result_t *d;
8129 int r = opendir(relpath, &d, perms);
8130 if (r < 0)
8131 return r;
8132
8133 getdir_result gr;
8134 gr.contents = &contents;
8135 gr.num = 0;
8136 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8137
8138 closedir(d);
8139
8140 if (r < 0)
8141 return r;
8142 return gr.num;
8143}
8144
8145
8146/****** file i/o **********/
// Open (and possibly create) the file at relpath.  Returns a non-negative
// integer fd on success, negative errno on failure.  The striping
// parameters (stripe_unit/count, object_size, data_pool) only apply when a
// new file is created via O_CREAT.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // Exclusive create on an existing target fails outright.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  // O_NOFOLLOW on a symlink is ELOOP (unless O_PATH allows opening the link).
#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // Target doesn't exist and O_CREAT was given: resolve the parent
    // directory and create the file there.
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create() may have already produced a file handle; otherwise open now.
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8231
8232int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8233{
8234 /* Use default file striping parameters */
8235 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8236}
8237
8238int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8239 const UserPerm& perms)
8240{
8241 Mutex::Locker lock(client_lock);
8242 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8243
181888fb
FG
8244 if (unmounting)
8245 return -ENOTCONN;
8246
7c673cae
FG
8247 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8248 filepath path(ino);
8249 req->set_filepath(path);
8250
8251 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8252 char f[30];
8253 sprintf(f, "%u", h);
8254 filepath path2(dirino);
8255 path2.push_dentry(string(f));
8256 req->set_filepath2(path2);
8257
8258 int r = make_request(req, perms, NULL, NULL,
8259 rand() % mdsmap->get_num_in_mds());
8260 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8261 return r;
8262}
8263
8264
8265/**
8266 * Load inode into local cache.
8267 *
8268 * If inode pointer is non-NULL, and take a reference on
8269 * the resulting Inode object in one operation, so that caller
8270 * can safely assume inode will still be there after return.
8271 */
1adf2230 8272int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
7c673cae 8273{
1adf2230 8274 ldout(cct, 8) << "lookup_ino enter(" << ino << ")" << dendl;
7c673cae 8275
181888fb
FG
8276 if (unmounting)
8277 return -ENOTCONN;
8278
7c673cae
FG
8279 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8280 filepath path(ino);
8281 req->set_filepath(path);
8282
8283 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8284 if (r == 0 && inode != NULL) {
8285 vinodeno_t vino(ino, CEPH_NOSNAP);
8286 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8287 assert(p != inode_map.end());
8288 *inode = p->second;
8289 _ll_get(*inode);
8290 }
1adf2230 8291 ldout(cct, 8) << "lookup_ino exit(" << ino << ") = " << r << dendl;
7c673cae
FG
8292 return r;
8293}
8294
1adf2230
AA
8295int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8296{
8297 Mutex::Locker lock(client_lock);
8298 return _lookup_ino(ino, perms, inode);
8299}
7c673cae
FG
8300
8301/**
8302 * Find the parent inode of `ino` and insert it into
8303 * our cache. Conditionally also set `parent` to a referenced
8304 * Inode* if caller provides non-NULL value.
8305 */
1adf2230 8306int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
7c673cae 8307{
1adf2230 8308 ldout(cct, 8) << "lookup_parent enter(" << ino->ino << ")" << dendl;
7c673cae 8309
181888fb
FG
8310 if (unmounting)
8311 return -ENOTCONN;
8312
7c673cae
FG
8313 if (!ino->dn_set.empty()) {
8314 // if we exposed the parent here, we'd need to check permissions,
8315 // but right now we just rely on the MDS doing so in make_request
1adf2230 8316 ldout(cct, 8) << "lookup_parent dentry already present" << dendl;
7c673cae
FG
8317 return 0;
8318 }
8319
8320 if (ino->is_root()) {
8321 *parent = NULL;
1adf2230 8322 ldout(cct, 8) << "ino is root, no parent" << dendl;
7c673cae
FG
8323 return -EINVAL;
8324 }
8325
8326 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8327 filepath path(ino->ino);
8328 req->set_filepath(path);
8329
8330 InodeRef target;
8331 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8332 // Give caller a reference to the parent ino if they provided a pointer.
8333 if (parent != NULL) {
8334 if (r == 0) {
8335 *parent = target.get();
8336 _ll_get(*parent);
1adf2230 8337 ldout(cct, 8) << "lookup_parent found parent " << (*parent)->ino << dendl;
7c673cae
FG
8338 } else {
8339 *parent = NULL;
8340 }
8341 }
1adf2230 8342 ldout(cct, 8) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8343 return r;
8344}
8345
1adf2230
AA
8346int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8347{
8348 Mutex::Locker lock(client_lock);
8349 return _lookup_parent(ino, perms, parent);
8350}
7c673cae
FG
8351
8352/**
8353 * Populate the parent dentry for `ino`, provided it is
8354 * a child of `parent`.
8355 */
1adf2230 8356int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
7c673cae
FG
8357{
8358 assert(parent->is_dir());
7c673cae
FG
8359 ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;
8360
181888fb
FG
8361 if (unmounting)
8362 return -ENOTCONN;
8363
7c673cae
FG
8364 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8365 req->set_filepath2(filepath(parent->ino));
8366 req->set_filepath(filepath(ino->ino));
8367 req->set_inode(ino);
8368
8369 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8370 ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
8371 return r;
8372}
8373
1adf2230
AA
8374int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8375{
8376 Mutex::Locker lock(client_lock);
8377 return _lookup_name(ino, parent, perms);
8378}
7c673cae
FG
8379
8380 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8381{
8382 assert(in);
8383 Fh *f = new Fh(in);
8384 f->mode = cmode;
8385 f->flags = flags;
8386
8387 // inode
8388 f->actor_perms = perms;
8389
8390 ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;
8391
8392 if (in->snapid != CEPH_NOSNAP) {
8393 in->snap_cap_refs++;
8394 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8395 << ccap_string(in->caps_issued()) << dendl;
8396 }
8397
8398 const md_config_t *conf = cct->_conf;
8399 f->readahead.set_trigger_requests(1);
8400 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8401 uint64_t max_readahead = Readahead::NO_LIMIT;
8402 if (conf->client_readahead_max_bytes) {
8403 max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8404 }
8405 if (conf->client_readahead_max_periods) {
8406 max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8407 }
8408 f->readahead.set_max_readahead_size(max_readahead);
8409 vector<uint64_t> alignments;
8410 alignments.push_back(in->layout.get_period());
8411 alignments.push_back(in->layout.stripe_unit);
8412 f->readahead.set_alignments(alignments);
8413
8414 return f;
8415}
8416
// Tear down a file handle: drop delegations, release the open ref (which
// may trigger a flush + cap check), release file locks, and surface any
// async write-back error that accumulated on the handle.
// Returns 0 or the captured async errno. Caller must hold client_lock.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    // last open ref for this mode: flush dirty data and re-evaluate caps
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes track opens via snap_cap_refs instead
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8451
8452void Client::_put_fh(Fh *f)
8453{
8454 int left = f->put();
8455 if (!left) {
8456 delete f;
8457 }
8458}
8459
// Open `in` with the given flags/mode, producing a new Fh in *fhp on
// success. Either satisfies the open from already-held caps or issues a
// CEPH_MDS_OP_OPEN to the MDS. Returns 0 or a negative errno.
// Caller must hold client_lock.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs; no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    // O_CREAT is handled by the create path, not here
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode);
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // failed open: undo the pending-open accounting taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8538
// Re-acquire file caps for `in` after they have gone stale, by replaying
// an open to the MDS with flags derived from the currently-wanted caps.
// If existing caps suffice (read-only want, or we still have the auth
// cap), just nudge check_caps instead. Returns 0 or a negative errno.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate wanted caps back into open flags for the replayed open
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8576
8577int Client::close(int fd)
8578{
8579 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8580 Mutex::Locker lock(client_lock);
8581 tout(cct) << "close" << std::endl;
8582 tout(cct) << fd << std::endl;
8583
181888fb
FG
8584 if (unmounting)
8585 return -ENOTCONN;
8586
7c673cae
FG
8587 Fh *fh = get_filehandle(fd);
8588 if (!fh)
8589 return -EBADF;
8590 int err = _release_fh(fh);
8591 fd_map.erase(fd);
8592 put_fd(fd);
8593 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8594 return err;
8595}
8596
8597
8598// ------------
8599// read, write
8600
8601loff_t Client::lseek(int fd, loff_t offset, int whence)
8602{
8603 Mutex::Locker lock(client_lock);
8604 tout(cct) << "lseek" << std::endl;
8605 tout(cct) << fd << std::endl;
8606 tout(cct) << offset << std::endl;
8607 tout(cct) << whence << std::endl;
8608
181888fb
FG
8609 if (unmounting)
8610 return -ENOTCONN;
8611
7c673cae
FG
8612 Fh *f = get_filehandle(fd);
8613 if (!f)
8614 return -EBADF;
8615#if defined(__linux__) && defined(O_PATH)
8616 if (f->flags & O_PATH)
8617 return -EBADF;
8618#endif
8619 return _lseek(f, offset, whence);
8620}
8621
8622loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8623{
8624 Inode *in = f->inode.get();
8625 int r;
8626
8627 switch (whence) {
8628 case SEEK_SET:
8629 f->pos = offset;
8630 break;
8631
8632 case SEEK_CUR:
8633 f->pos += offset;
8634 break;
8635
8636 case SEEK_END:
8637 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8638 if (r < 0)
8639 return r;
8640 f->pos = in->size + offset;
8641 break;
8642
8643 default:
8644 ceph_abort();
8645 }
8646
1adf2230 8647 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
7c673cae
FG
8648 return f->pos;
8649}
8650
8651
// Acquire the per-handle position lock used to serialize implicit-offset
// reads/writes on f. Waiters queue FIFO: each blocks on its own Cond
// (tied to client_lock) until the lock is free AND it is at the head of
// pos_waiters. Caller must hold client_lock; pairs with unlock_fh_pos().
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until the lock is released and we are the oldest waiter
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8669
8670void Client::unlock_fh_pos(Fh *f)
8671{
8672 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8673 f->pos_locked = false;
8674}
8675
// Migrate inline file data out of the inode into the first RADOS object
// ("<ino>.00000000"). Two mutations are queued: an idempotent object
// create, then a guarded write that only applies if the object's recorded
// inline_version is older than ours (cmpxattr GT). `onfinish` fires when
// the second mutation completes. Returns 0 (completion is async).
int Client::uninline_data(Inode *in, Context *onfinish)
{
  // nothing inline to migrate; complete immediately
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // guard: only write if our inline_version is newer than the object's
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8720
8721//
8722
8723// blocking osd interface
8724
// read(2)-style entry point: read up to `size` bytes at `offset` (or at
// the fd's current position if offset < 0) into `buf`. Returns bytes
// read or a negative errno. `buf` must have room for `size` bytes.
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // copy out of the bufferlist; _read returns at most `size` bytes
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }
  return r;
}
8752
8753int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8754{
8755 if (iovcnt < 0)
8756 return -EINVAL;
8757 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8758}
8759
// Core read path. Reads `size` bytes at `offset` (or at f->pos with
// position locking if offset < 0) into *bl. Handles inline data (serving
// from the inode, or kicking off uninlining when we lack cache caps),
// cached/async reads vs. sync OSD reads, and the short-read retry after
// re-verifying EOF. Returns bytes read or a negative errno.
// Caller must hold client_lock; it is dropped while waiting on uninline.
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // offset < 0 means "use and advance the fd position"
  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // inline state unknown; fetch it before deciding how to read
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
        unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // no cache cap: push the inline data out to RADOS, then read normally
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve directly from the inode's inline data, zero-filling any gap
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read: drop caps, re-verify size, and retry if not at EOF
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait (without client_lock) for the uninline mutation to complete
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
8903
// Completion context for background readahead. Holds a ref on the Fh and
// a pending-readahead count for its lifetime; on finish, drops the cap
// refs taken when the readahead was issued.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}

void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // release the RD|CACHE refs taken by _read_async when it issued this
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8919
// Read through the object cache, blocking only if the data is not already
// cached, then opportunistically issue a background readahead. The read
// is trimmed to the known file size. Returns bytes read or negative errno.
// Caller must hold client_lock; it is dropped while waiting on the cache.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    // cache miss: hold a CACHE ref and wait for the read to complete
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	// refs are released by C_Readahead::finish when the I/O lands
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
8984
// Synchronous read directly from the OSDs (no object cache), looping until
// `len` bytes are read or a short read is hit. Holes and reads past an
// object's data (OSD returns ENOENT or short) are zero-filled up to the
// known EOF; *checkeof is set when the caller should re-verify the file
// size and possibly retry. Returns bytes read or a negative errno.
// Caller must hold client_lock; it is dropped around each OSD wait.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // at or past the size we know about; caller should re-check EOF
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9051
9052
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync can DDRT.
 */
// Called when a synchronous OSD write completes: drops the in-flight
// counter and the buffer cap ref taken in _write(), and wakes an
// unmount waiting for the last unsafe write to drain.
void Client::_sync_write_commit(Inode *in)
{
  assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  // the sync write held a FILE_BUFFER cap ref while in flight
  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9070
9071int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9072{
9073 Mutex::Locker lock(client_lock);
9074 tout(cct) << "write" << std::endl;
9075 tout(cct) << fd << std::endl;
9076 tout(cct) << size << std::endl;
9077 tout(cct) << offset << std::endl;
9078
181888fb
FG
9079 if (unmounting)
9080 return -ENOTCONN;
9081
7c673cae
FG
9082 Fh *fh = get_filehandle(fd);
9083 if (!fh)
9084 return -EBADF;
9085#if defined(__linux__) && defined(O_PATH)
9086 if (fh->flags & O_PATH)
9087 return -EBADF;
9088#endif
9089 int r = _write(fh, offset, size, buf, NULL, 0);
9090 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9091 return r;
9092}
9093
9094int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9095{
9096 if (iovcnt < 0)
9097 return -EINVAL;
9098 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9099}
9100
// Shared implementation behind preadv/pwritev. For writes, the iovecs are
// passed straight through to _write; for reads, a single _read fills a
// bufferlist which is then scattered into the caller's iovecs (partial
// fills are allowed when fewer bytes than requested were read).
// Returns bytes transferred or a negative errno.
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9150
// Core write path. Writes `size` bytes from `buf` (or from `iov` when buf
// is NULL) at `offset`, or at f->pos (with O_APPEND handling) when
// offset < 0. Enforces max file size, pool-full, and quota limits;
// acquires WR|AUTH_SHARED caps; clears setuid/setgid; handles inline
// data (in-inode update or uninlining); then writes either through the
// object cache or synchronously to the OSDs. On success updates size,
// mtime, change_attr and dirties caps. Returns bytes written or -errno.
// Caller must hold client_lock; it is dropped while waiting on I/O.
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	           const struct iovec *iov, int iovcnt)
{
  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  uint64_t endoff = offset + size;
  std::list<InodeRef> quota_roots;
  if (endoff > in->size &&
      is_quota_bytes_exceeded(in, endoff - in->size, f->actor_perms, &quota_roots)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    f->pos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    // inline state unknown; fetch before deciding how to write
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // too big (or no buffer cap) to stay inline: migrate data out first
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      // splice the write into the in-inode inline data buffer
      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, quota_roots)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (onuninline) {
    // wait (without client_lock) for the uninline mutation to complete
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9389
9390int Client::_flush(Fh *f)
9391{
9392 Inode *in = f->inode.get();
9393 int err = f->take_async_err();
9394 if (err != 0) {
9395 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9396 << cpp_strerror(err) << dendl;
9397 } else {
9398 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9399 }
9400
9401 return err;
9402}
9403
9404int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9405{
9406 struct ceph_statx stx;
9407 stx.stx_size = length;
9408 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9409}
9410
9411int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9412{
9413 Mutex::Locker lock(client_lock);
9414 tout(cct) << "ftruncate" << std::endl;
9415 tout(cct) << fd << std::endl;
9416 tout(cct) << length << std::endl;
9417
181888fb
FG
9418 if (unmounting)
9419 return -ENOTCONN;
9420
7c673cae
FG
9421 Fh *f = get_filehandle(fd);
9422 if (!f)
9423 return -EBADF;
9424#if defined(__linux__) && defined(O_PATH)
9425 if (f->flags & O_PATH)
9426 return -EBADF;
9427#endif
9428 struct stat attr;
9429 attr.st_size = length;
9430 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9431}
9432
// fsync(2)-style entry point: flush fd's data (and metadata unless
// syncdataonly) and report either the sync result or any pending async
// write-back error. The Fh's async error is consumed in both branches so
// the same error is not returned twice.
int Client::fsync(int fd, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9468
// Flush an inode: write back cached data (when the object cacher is
// enabled), flush dirty caps to the MDS unless syncdataonly, and wait for
// any unsafe MDS requests to become safe. Returns 0 or a negative errno.
// Caller must hold client_lock; it is dropped while waiting on the flush.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty metadata caps synchronously and note the flush tid to wait on
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync();

    // unsafe_ops is ordered, so waiting on the newest covers them all
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9535
9536int Client::_fsync(Fh *f, bool syncdataonly)
9537{
1adf2230 9538 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
9539 return _fsync(f->inode.get(), syncdataonly);
9540}
9541
// stat(2)-style attribute fetch for an open file descriptor.
//
// @param fd    client-level file descriptor
// @param stbuf output stat buffer, filled on success
// @param perms credentials used for the getattr
// @param mask  CEPH_STAT_CAP_* bits selecting which attributes are needed
// @return 0 on success; -ENOTCONN if unmounting, -EBADF for a bad fd,
//         or the error from _getattr
int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  int r = _getattr(f->inode, mask, perms);
  if (r < 0)
    return r;
  fill_stat(f->inode, stbuf, NULL);
  ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
  return r;
}
9561
// statx(2)-style attribute fetch for an open file descriptor.
//
// Unlike fstat(), this may be satisfied purely from locally cached caps:
// the MDS round-trip is skipped when the inode already holds caps covering
// every requested field.
//
// @param fd    client-level file descriptor
// @param stx   output statx buffer, filled on success
// @param perms credentials used for any getattr round-trip
// @param want  STATX_* bits the caller is interested in
// @param flags AT_* flags (e.g. AT_NO_ATTR_SYNC), combined into the cap mask
// @return 0 on success; -ENOTCONN if unmounting, -EBADF for a bad fd,
//         or the error from _getattr
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  // only talk to the MDS when cached caps don't cover the requested fields
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9591
9592// not written yet, but i want to link!
9593
// Change the client's current working directory and report the new
// absolute path.
//
// @param relpath path (relative to the current cwd) to change into
// @param new_cwd filled with the resulting absolute cwd path
// @param perms   credentials for the path walk
// @return 0 on success; -ENOTCONN if unmounting, or a path_walk error
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  // swap keeps the reference counting simple: cwd takes over 'in'
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9616
// Build the absolute path of the current working directory by walking
// parent dentries from cwd up to the mount root.
//
// If a parent link is missing from the cache, a LOOKUPNAME request is sent
// to the MDS and the walk restarts from cwd.  If the cwd (or an ancestor)
// has been unlinked, 'dir' is left untouched.
//
// Called with client_lock held.
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << "getcwd " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    assert(in->dn_set.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dn_set.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over: the lookup may have populated several links, so
      // rebuild the path from scratch
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9656
b5b8bbf5
FG
9657void Client::getcwd(string& dir, const UserPerm& perms)
9658{
9659 Mutex::Locker l(client_lock);
181888fb
FG
9660 if (!unmounting)
9661 _getcwd(dir, perms);
b5b8bbf5
FG
9662}
9663
7c673cae
FG
// statfs(2)/statvfs(3) implementation.
//
// Fetches cluster-wide (or single-data-pool) statistics from RADOS, then
// either reports those directly or, when a byte quota applies to the mount
// root (and client_quota_df is enabled), reports the quota as the
// filesystem size.  The 'path' argument is unused: statistics are global.
//
// @return 0 on success; -ENOTCONN if unmounting, or the objecter error
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;

  if (unmounting)
    return -ENOTCONN;

  ceph_statfs stats;
  C_SaferCond cond;

  // with a single data pool we can report per-pool stats, which are more
  // accurate than whole-cluster numbers
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  // drop client_lock while blocking on the OSD reply
  client_lock.Unlock();
  int rval = cond.wait();
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks.  We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = stats.num_objects;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9762
// Core file-locking helper shared by fcntl- and flock-style operations.
//
// Translates a POSIX struct flock into an MDS filelock request, optionally
// enabling the interrupt callback so a blocking SETFILELOCK can be
// cancelled, and on success mirrors the result into the client's local
// lock-state bookkeeping.
//
// @param in        inode being locked
// @param fh        open file handle (supplies credentials; tracks per-fh locks)
// @param lock_type CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
// @param op        CEPH_MDS_OP_SETFILELOCK or CEPH_MDS_OP_GETFILELOCK
// @param sleep     nonzero to block until the lock is granted
// @param fl        lock description in/out (filled for GETFILELOCK)
// @param owner     lock owner token
// @param removing  true when called from _release_filelocks; skips per-fh
//                  state updates because the fh is going away
// @return 0 on success; -EIO for an unknown l_type, or a request error
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // only a blocking SETFILELOCK that actually takes a lock may sleep
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting (or unlocked) lock description from the reply
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // mirror the granted lock into per-inode state...
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // ...and, unless the fh is being torn down, into per-fh state too
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9873
// Cancel an in-flight blocking filelock request.
//
// Marks the original request aborted (so it won't be re-sent) and, if it
// was already sent to an MDS, issues a companion *_INTR unlock request to
// revoke its effect server-side.
//
// @param req the pending SETFILELOCK request to interrupt
// @return 0 if the request had not been sent; otherwise the result of the
//         interrupt request
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // mirror the original request's arguments, but as an unlock under the
  // corresponding *_INTR rule
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
9906
9907void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9908{
9909 if (!in->fcntl_locks && !in->flock_locks)
9910 return;
9911
9912 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9913 ::encode(nr_fcntl_locks, bl);
9914 if (nr_fcntl_locks) {
9915 ceph_lock_state_t* lock_state = in->fcntl_locks;
9916 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9917 p != lock_state->held_locks.end();
9918 ++p)
9919 ::encode(p->second, bl);
9920 }
9921
9922 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9923 ::encode(nr_flock_locks, bl);
9924 if (nr_flock_locks) {
9925 ceph_lock_state_t* lock_state = in->flock_locks;
9926 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9927 p != lock_state->held_locks.end();
9928 ++p)
9929 ::encode(p->second, bl);
9930 }
9931
9932 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
9933 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
9934}
9935
// Release every lock held through a file handle when it is closed.
//
// Collects the fh's fcntl and flock locks, frees the per-fh lock state,
// then sends an unlock to the MDS for each one (with removing=true so
// _do_filelock skips updating the now-deleted per-fh state).
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    ceph_lock_state_t* lock_state = fh->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
    delete fh->fcntl_locks;
  }
  if (fh->flock_locks) {
    ceph_lock_state_t* lock_state = fh->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
    delete fh->flock_locks;
  }

  if (to_release.empty())
    return;

  // common unlock template; per-lock fields are filled in the loop below
  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
9981
9982void Client::_update_lock_state(struct flock *fl, uint64_t owner,
9983 ceph_lock_state_t *lock_state)
9984{
9985 int lock_cmd;
9986 if (F_RDLCK == fl->l_type)
9987 lock_cmd = CEPH_LOCK_SHARED;
9988 else if (F_WRLCK == fl->l_type)
9989 lock_cmd = CEPH_LOCK_EXCL;
9990 else
9991 lock_cmd = CEPH_LOCK_UNLOCK;;
9992
9993 ceph_filelock filelock;
9994 filelock.start = fl->l_start;
9995 filelock.length = fl->l_len;
9996 filelock.client = 0;
9997 // see comment in _do_filelock()
9998 filelock.owner = owner | (1ULL << 63);
9999 filelock.pid = fl->l_pid;
10000 filelock.type = lock_cmd;
10001
10002 if (filelock.type == CEPH_LOCK_UNLOCK) {
10003 list<ceph_filelock> activated_locks;
10004 lock_state->remove_lock(filelock, activated_locks);
10005 } else {
10006 bool r = lock_state->add_lock(filelock, false, false, NULL);
10007 assert(r);
10008 }
10009}
10010
10011int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10012{
10013 Inode *in = fh->inode.get();
10014 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10015 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10016 return ret;
10017}
10018
10019int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10020{
10021 Inode *in = fh->inode.get();
10022 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10023 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10024 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10025 return ret;
10026}
10027
10028int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10029{
10030 Inode *in = fh->inode.get();
10031 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10032
10033 int sleep = !(cmd & LOCK_NB);
10034 cmd &= ~LOCK_NB;
10035
10036 int type;
10037 switch (cmd) {
10038 case LOCK_SH:
10039 type = F_RDLCK;
10040 break;
10041 case LOCK_EX:
10042 type = F_WRLCK;
10043 break;
10044 case LOCK_UN:
10045 type = F_UNLCK;
10046 break;
10047 default:
10048 return -EINVAL;
10049 }
10050
10051 struct flock fl;
10052 memset(&fl, 0, sizeof(fl));
10053 fl.l_type = type;
10054 fl.l_whence = SEEK_SET;
10055
10056 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10057 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10058 return ret;
10059}
10060
// Low-level statfs entry point; statistics are global, so the inode is
// unused and we simply forward to statfs() with a null path.
int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
{
  /* Since the only thing this does is wrap a call to statfs, and
     statfs takes a lock, it doesn't seem we have a need to split it
     out. */
  return statfs(0, stbuf, perms);
}
10068
// Register the FUSE/libcephfs callback set (cache invalidation, lock
// interruption, remount, umask) and start the finisher thread for each
// callback that was provided.  A null 'args' is a no-op.
void Client::ll_register_callbacks(struct client_callback_args *args)
{
  if (!args)
    return;
  Mutex::Locker l(client_lock);
  ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  // umask callback needs no finisher thread
  umask_cb = args->umask_cb;
}
10099
// Verify that we have some way to invalidate kernel dentry cache entries:
// either a registered dentry-invalidate callback or a working remount.
//
// If neither works, optionally abort (client_die_on_failed_dentry_invalidate)
// since stale dentries can otherwise be served to applications.
//
// @param can_invalidate true if the kernel supports per-dentry invalidation
// @return 0 if a mechanism is available, otherwise the remount error
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else if (remount_cb) {
    // fall back to a full remount to flush the kernel dcache
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount();
  }
  if (r) {
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort) {
      lderr(cct) << "no method to invalidate kernel dentry cache; quitting!" << dendl;
      ceph_abort();
    } else {
      lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    }
  }
  return r;
}
10125
// Flush all dirty file data and caps for the whole client and wait for
// everything (data, caps, unsafe MDS requests) to become durable.
//
// Called with client_lock held; the lock is dropped while waiting on the
// object cacher's flush completion.
//
// @return always 0 (data flush errors are not propagated here)
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;  // no object cacher: nothing buffered to flush

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    // drop client_lock while blocking on the data flush completion
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
10160
10161int Client::sync_fs()
10162{
10163 Mutex::Locker l(client_lock);
181888fb
FG
10164
10165 if (unmounting)
10166 return -ENOTCONN;
10167
7c673cae
FG
10168 return _sync_fs();
10169}
10170
10171int64_t Client::drop_caches()
10172{
10173 Mutex::Locker l(client_lock);
10174 return objectcacher->release_all();
10175}
10176
10177
10178int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10179{
10180 Mutex::Locker l(client_lock);
10181 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10182 << ", " << offset << ", " << count << ")" << dendl;
10183
10184 Fh *f = get_filehandle(fd);
10185 if (!f)
10186 return -EBADF;
10187
10188 // for now
10189 _fsync(f, true);
10190
10191 return 0;
10192}
10193
10194int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10195{
10196 Mutex::Locker l(client_lock);
10197 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10198 << ", " << offset << ", " << count << ")" << dendl;
10199
10200 Fh *f = get_filehandle(fd);
10201 if (!f)
10202 return -EBADF;
10203 Inode *in = f->inode.get();
10204
10205 _fsync(f, true);
10206 if (_release(in))
10207 check_caps(in, 0);
10208 return 0;
10209}
10210
10211
10212// =============================
10213// snaps
10214
// Create a snapshot 'name' of the directory at 'relpath' by doing a mkdir
// inside its virtual .snap directory.
//
// @return 0 on success; -ENOTCONN if unmounting, or a path_walk /
//         permission / _mkdir error
int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_create(in.get(), perm);
    if (r < 0)
      return r;
  }
  // snapshots are created as directories under the snapdir
  Inode *snapdir = open_snapdir(in.get());
  return _mkdir(snapdir, name, 0, perm);
}
181888fb 10235
7c673cae
FG
// Remove snapshot 'name' of the directory at 'relpath' by doing an rmdir
// inside its virtual .snap directory.
//
// @return 0 on success; -ENOTCONN if unmounting, or a path_walk /
//         permission / _rmdir error
int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_delete(in.get(), NULL, perms);
    if (r < 0)
      return r;
  }
  // snapshots are removed as directories under the snapdir
  Inode *snapdir = open_snapdir(in.get());
  return _rmdir(snapdir, name, perms);
}
10256
10257// =============================
10258// expose caps
10259
10260int Client::get_caps_issued(int fd) {
10261
10262 Mutex::Locker lock(client_lock);
10263
181888fb
FG
10264 if (unmounting)
10265 return -ENOTCONN;
10266
7c673cae
FG
10267 Fh *f = get_filehandle(fd);
10268 if (!f)
10269 return -EBADF;
10270
10271 return f->inode->caps_issued();
10272}
10273
10274int Client::get_caps_issued(const char *path, const UserPerm& perms)
10275{
10276 Mutex::Locker lock(client_lock);
181888fb
FG
10277
10278 if (unmounting)
10279 return -ENOTCONN;
10280
7c673cae
FG
10281 filepath p(path);
10282 InodeRef in;
10283 int r = path_walk(p, &in, perms, true);
10284 if (r < 0)
10285 return r;
10286 return in->caps_issued();
10287}
10288
10289// =========================================
10290// low level
10291
// Return (creating on first use) the virtual '.snap' directory inode for
// 'diri'.  The snapdir shares the directory's ino but uses the special
// CEPH_SNAPDIR snapid, and mirrors a subset of the directory's attributes.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // clone the attributes callers will observe on the snapdir
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10323
// Low-level lookup of 'name' under 'parent' (FUSE lookup path).
//
// On success fills 'attr', takes an ll reference on the found inode, and
// stores it in *out.  On failure attr->st_ino is zeroed and *out is set
// from the (empty) InodeRef, i.e. NULL.
//
// @return 0 on success; -ENOTCONN if unmounting, or a permission/_lookup error
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // when the kernel/FUSE isn't doing permission checks for us, do them here
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // caller owns one ll reference on success

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10363
1adf2230
AA
// Look up an inode by bare inode number, also priming the cache with its
// parent and its dentry name so later path-based operations work.
//
// On success *inode holds an ll reference owned by the caller; the
// temporary parent reference is dropped before returning.
//
// @return 0 on success (including the root-inode case where no parent
//         exists), otherwise a lookup error
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r) {
    return r;
  }
  assert(inode != NULL);
  assert(*inode != NULL);

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r && r != -EINVAL) {
    // Unexpected error; drop the reference taken in Num1
    _ll_forget(*inode, 1);
    return r;
  } else if (r == -EINVAL) {
    // EINVAL indicates node without parents (root), drop out now
    // and don't try to look up the non-existent dentry.
    return 0;
  }
  // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
  // is already in cache
  assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error; drop both references before bailing out
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // keep the *inode reference for the caller; release only the parent
  _ll_forget(parent, 1);
  return 0;
}
10408
7c673cae
FG
// statx-flavored low-level lookup of 'name' under 'parent'.
//
// Like ll_lookup(), but the attribute mask is derived from the statx
// want/flags pair and results are returned via ceph_statx.  On success the
// caller owns one ll reference on *out; on failure stx ino/mask are zeroed.
//
// @return 0 on success; -ENOTCONN if unmounting, or a permission/_lookup error
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // when the kernel/FUSE isn't doing permission checks for us, do them here
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // caller owns one ll reference on success
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10449
// Resolve an absolute path to an inode (low-level interface).
//
// Walks 'name' from the mount root, optionally following a trailing
// symlink (unless AT_SYMLINK_NOFOLLOW is set), fills 'stx', and hands the
// caller an ll reference in *out.  On failure *out is NULL and the stx
// ino/mask fields are zeroed.
//
// @return 0 on success; -ENOTCONN if unmounting, or a path_walk error
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << "ll_walk" << name << dendl;
  tout(cct) << "ll_walk" << std::endl;
  tout(cct) << name << std::endl;

  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // caller owns one ll reference on success
    *out = in.get();
    return 0;
  }
}
10482
// Take one low-level (FUSE-visible) reference on an inode.
//
// The first ll reference also takes an internal inode reference and pins
// the parent dentry of a directory, so the cached path stays alive while
// the kernel holds the inode.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10495
// Drop 'num' low-level references from an inode.
//
// When the ll refcount hits zero the dentry pin taken in _ll_get is
// released and the internal inode reference is dropped (which may destroy
// the inode).
//
// @return the remaining ll reference count (0 means fully released)
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10511
// Drop every outstanding low-level reference (used at unmount/shutdown).
//
// Inodes are first inserted into a set of InodeRefs so they stay alive
// until the function returns; _ll_put may otherwise destroy an inode while
// we are still iterating the inode_map (the iterator is advanced before
// each put for the same reason).
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << "_ll_drop_pins" << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
10529
// Drop 'count' low-level references (FUSE "forget").
//
// Forgets are ignored while unmounting and on the root inode.  If the
// kernel asks to forget more references than we hold, a warning is logged
// and all remaining references are dropped.
//
// @return true if this was the last ll reference (or the forget was
//         ignored), false if references remain
bool Client::_ll_forget(Inode *in, int count)
{
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 8) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10558
1adf2230
AA
10559bool Client::ll_forget(Inode *in, int count)
10560{
10561 Mutex::Locker lock(client_lock);
10562 return _ll_forget(in, count);
10563}
10564
7c673cae
FG
10565bool Client::ll_put(Inode *in)
10566{
10567 /* ll_forget already takes the lock */
10568 return ll_forget(in, 1);
10569}
10570
10571snapid_t Client::ll_get_snapid(Inode *in)
10572{
10573 Mutex::Locker lock(client_lock);
10574 return in->snapid;
10575}
10576
10577Inode *Client::ll_get_inode(ino_t ino)
10578{
10579 Mutex::Locker lock(client_lock);
181888fb
FG
10580
10581 if (unmounting)
10582 return NULL;
10583
7c673cae
FG
10584 vinodeno_t vino = _map_faked_ino(ino);
10585 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10586 if (p == inode_map.end())
10587 return NULL;
10588 Inode *in = p->second;
10589 _ll_get(in);
10590 return in;
10591}
10592
10593Inode *Client::ll_get_inode(vinodeno_t vino)
10594{
10595 Mutex::Locker lock(client_lock);
181888fb
FG
10596
10597 if (unmounting)
10598 return NULL;
10599
7c673cae
FG
10600 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10601 if (p == inode_map.end())
10602 return NULL;
10603 Inode *in = p->second;
10604 _ll_get(in);
10605 return in;
10606}
10607
10608int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10609{
10610 vinodeno_t vino = _get_vino(in);
10611
1adf2230 10612 ldout(cct, 8) << "ll_getattr " << vino << dendl;
7c673cae
FG
10613 tout(cct) << "ll_getattr" << std::endl;
10614 tout(cct) << vino.ino.val << std::endl;
10615
10616 if (vino.snapid < CEPH_NOSNAP)
10617 return 0;
10618 else
10619 return _getattr(in, caps, perms);
10620}
10621
10622int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10623{
10624 Mutex::Locker lock(client_lock);
10625
181888fb
FG
10626 if (unmounting)
10627 return -ENOTCONN;
10628
7c673cae
FG
10629 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10630
10631 if (res == 0)
10632 fill_stat(in, attr);
10633 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10634 return res;
10635}
10636
10637int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10638 unsigned int flags, const UserPerm& perms)
10639{
10640 Mutex::Locker lock(client_lock);
10641
181888fb
FG
10642 if (unmounting)
10643 return -ENOTCONN;
10644
7c673cae
FG
10645 int res = 0;
10646 unsigned mask = statx_to_mask(flags, want);
10647
94b18763 10648 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
10649 res = _ll_getattr(in, mask, perms);
10650
10651 if (res == 0)
10652 fill_statx(in, mask, stx);
10653 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10654 return res;
10655}
10656
10657int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10658 const UserPerm& perms, InodeRef *inp)
10659{
10660 vinodeno_t vino = _get_vino(in);
10661
1adf2230 10662 ldout(cct, 8) << "ll_setattrx " << vino << " mask " << hex << mask << dec
7c673cae
FG
10663 << dendl;
10664 tout(cct) << "ll_setattrx" << std::endl;
10665 tout(cct) << vino.ino.val << std::endl;
10666 tout(cct) << stx->stx_mode << std::endl;
10667 tout(cct) << stx->stx_uid << std::endl;
10668 tout(cct) << stx->stx_gid << std::endl;
10669 tout(cct) << stx->stx_size << std::endl;
10670 tout(cct) << stx->stx_mtime << std::endl;
10671 tout(cct) << stx->stx_atime << std::endl;
10672 tout(cct) << stx->stx_btime << std::endl;
10673 tout(cct) << mask << std::endl;
10674
10675 if (!cct->_conf->fuse_default_permissions) {
10676 int res = may_setattr(in, stx, mask, perms);
10677 if (res < 0)
10678 return res;
10679 }
10680
10681 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
10682
10683 return __setattrx(in, stx, mask, perms, inp);
10684}
10685
10686int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10687 const UserPerm& perms)
10688{
10689 Mutex::Locker lock(client_lock);
181888fb
FG
10690
10691 if (unmounting)
10692 return -ENOTCONN;
10693
7c673cae
FG
10694 InodeRef target(in);
10695 int res = _ll_setattrx(in, stx, mask, perms, &target);
10696 if (res == 0) {
10697 assert(in == target.get());
10698 fill_statx(in, in->caps_issued(), stx);
10699 }
10700
10701 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10702 return res;
10703}
10704
10705int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10706 const UserPerm& perms)
10707{
10708 struct ceph_statx stx;
10709 stat_to_statx(attr, &stx);
10710
10711 Mutex::Locker lock(client_lock);
181888fb
FG
10712
10713 if (unmounting)
10714 return -ENOTCONN;
10715
7c673cae
FG
10716 InodeRef target(in);
10717 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10718 if (res == 0) {
10719 assert(in == target.get());
10720 fill_stat(in, attr);
10721 }
10722
10723 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10724 return res;
10725}
10726
10727
10728// ----------
10729// xattrs
10730
10731int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10732 const UserPerm& perms)
10733{
10734 Mutex::Locker lock(client_lock);
181888fb
FG
10735
10736 if (unmounting)
10737 return -ENOTCONN;
10738
7c673cae
FG
10739 InodeRef in;
10740 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10741 if (r < 0)
10742 return r;
10743 return _getxattr(in, name, value, size, perms);
10744}
10745
10746int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10747 const UserPerm& perms)
10748{
10749 Mutex::Locker lock(client_lock);
181888fb
FG
10750
10751 if (unmounting)
10752 return -ENOTCONN;
10753
7c673cae
FG
10754 InodeRef in;
10755 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10756 if (r < 0)
10757 return r;
10758 return _getxattr(in, name, value, size, perms);
10759}
10760
10761int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10762 const UserPerm& perms)
10763{
10764 Mutex::Locker lock(client_lock);
181888fb
FG
10765
10766 if (unmounting)
10767 return -ENOTCONN;
10768
7c673cae
FG
10769 Fh *f = get_filehandle(fd);
10770 if (!f)
10771 return -EBADF;
10772 return _getxattr(f->inode, name, value, size, perms);
10773}
10774
10775int Client::listxattr(const char *path, char *list, size_t size,
10776 const UserPerm& perms)
10777{
10778 Mutex::Locker lock(client_lock);
181888fb
FG
10779
10780 if (unmounting)
10781 return -ENOTCONN;
10782
7c673cae
FG
10783 InodeRef in;
10784 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10785 if (r < 0)
10786 return r;
10787 return Client::_listxattr(in.get(), list, size, perms);
10788}
10789
10790int Client::llistxattr(const char *path, char *list, size_t size,
10791 const UserPerm& perms)
10792{
10793 Mutex::Locker lock(client_lock);
181888fb
FG
10794
10795 if (unmounting)
10796 return -ENOTCONN;
10797
7c673cae
FG
10798 InodeRef in;
10799 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10800 if (r < 0)
10801 return r;
10802 return Client::_listxattr(in.get(), list, size, perms);
10803}
10804
10805int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10806{
10807 Mutex::Locker lock(client_lock);
181888fb
FG
10808
10809 if (unmounting)
10810 return -ENOTCONN;
10811
7c673cae
FG
10812 Fh *f = get_filehandle(fd);
10813 if (!f)
10814 return -EBADF;
10815 return Client::_listxattr(f->inode.get(), list, size, perms);
10816}
10817
10818int Client::removexattr(const char *path, const char *name,
10819 const UserPerm& perms)
10820{
10821 Mutex::Locker lock(client_lock);
181888fb
FG
10822
10823 if (unmounting)
10824 return -ENOTCONN;
10825
7c673cae
FG
10826 InodeRef in;
10827 int r = Client::path_walk(path, &in, perms, true);
10828 if (r < 0)
10829 return r;
10830 return _removexattr(in, name, perms);
10831}
10832
10833int Client::lremovexattr(const char *path, const char *name,
10834 const UserPerm& perms)
10835{
10836 Mutex::Locker lock(client_lock);
181888fb
FG
10837
10838 if (unmounting)
10839 return -ENOTCONN;
10840
7c673cae
FG
10841 InodeRef in;
10842 int r = Client::path_walk(path, &in, perms, false);
10843 if (r < 0)
10844 return r;
10845 return _removexattr(in, name, perms);
10846}
10847
10848int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10849{
10850 Mutex::Locker lock(client_lock);
181888fb
FG
10851
10852 if (unmounting)
10853 return -ENOTCONN;
10854
7c673cae
FG
10855 Fh *f = get_filehandle(fd);
10856 if (!f)
10857 return -EBADF;
10858 return _removexattr(f->inode, name, perms);
10859}
10860
10861int Client::setxattr(const char *path, const char *name, const void *value,
10862 size_t size, int flags, const UserPerm& perms)
10863{
10864 _setxattr_maybe_wait_for_osdmap(name, value, size);
10865
10866 Mutex::Locker lock(client_lock);
181888fb
FG
10867
10868 if (unmounting)
10869 return -ENOTCONN;
10870
7c673cae
FG
10871 InodeRef in;
10872 int r = Client::path_walk(path, &in, perms, true);
10873 if (r < 0)
10874 return r;
10875 return _setxattr(in, name, value, size, flags, perms);
10876}
10877
10878int Client::lsetxattr(const char *path, const char *name, const void *value,
10879 size_t size, int flags, const UserPerm& perms)
10880{
10881 _setxattr_maybe_wait_for_osdmap(name, value, size);
10882
10883 Mutex::Locker lock(client_lock);
181888fb
FG
10884
10885 if (unmounting)
10886 return -ENOTCONN;
10887
7c673cae
FG
10888 InodeRef in;
10889 int r = Client::path_walk(path, &in, perms, false);
10890 if (r < 0)
10891 return r;
10892 return _setxattr(in, name, value, size, flags, perms);
10893}
10894
10895int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10896 int flags, const UserPerm& perms)
10897{
10898 _setxattr_maybe_wait_for_osdmap(name, value, size);
10899
10900 Mutex::Locker lock(client_lock);
181888fb
FG
10901
10902 if (unmounting)
10903 return -ENOTCONN;
10904
7c673cae
FG
10905 Fh *f = get_filehandle(fd);
10906 if (!f)
10907 return -EBADF;
10908 return _setxattr(f->inode, name, value, size, flags, perms);
10909}
10910
10911int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10912 const UserPerm& perms)
10913{
10914 int r;
10915
10916 const VXattr *vxattr = _match_vxattr(in, name);
10917 if (vxattr) {
10918 r = -ENODATA;
10919
10920 // Do a force getattr to get the latest quota before returning
10921 // a value to userspace.
28e407b8
AA
10922 int flags = 0;
10923 if (vxattr->flags & VXATTR_RSTAT) {
10924 flags |= CEPH_STAT_RSTAT;
10925 }
10926 r = _getattr(in, flags, perms, true);
7c673cae
FG
10927 if (r != 0) {
10928 // Error from getattr!
10929 return r;
10930 }
10931
10932 // call pointer-to-member function
10933 char buf[256];
10934 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10935 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10936 } else {
10937 r = -ENODATA;
10938 }
10939
10940 if (size != 0) {
10941 if (r > (int)size) {
10942 r = -ERANGE;
10943 } else if (r > 0) {
10944 memcpy(value, buf, r);
10945 }
10946 }
10947 goto out;
10948 }
10949
10950 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10951 r = -EOPNOTSUPP;
10952 goto out;
10953 }
10954
10955 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10956 if (r == 0) {
10957 string n(name);
10958 r = -ENODATA;
10959 if (in->xattrs.count(n)) {
10960 r = in->xattrs[n].length();
10961 if (r > 0 && size != 0) {
10962 if (size >= (unsigned)r)
10963 memcpy(value, in->xattrs[n].c_str(), r);
10964 else
10965 r = -ERANGE;
10966 }
10967 }
10968 }
10969 out:
1adf2230 10970 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
10971 return r;
10972}
10973
10974int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
10975 const UserPerm& perms)
10976{
10977 if (cct->_conf->client_permissions) {
10978 int r = xattr_permission(in.get(), name, MAY_READ, perms);
10979 if (r < 0)
10980 return r;
10981 }
10982 return _getxattr(in.get(), name, value, size, perms);
10983}
10984
10985int Client::ll_getxattr(Inode *in, const char *name, void *value,
10986 size_t size, const UserPerm& perms)
10987{
10988 Mutex::Locker lock(client_lock);
10989
181888fb
FG
10990 if (unmounting)
10991 return -ENOTCONN;
10992
7c673cae
FG
10993 vinodeno_t vino = _get_vino(in);
10994
10995 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
10996 tout(cct) << "ll_getxattr" << std::endl;
10997 tout(cct) << vino.ino.val << std::endl;
10998 tout(cct) << name << std::endl;
10999
11000 if (!cct->_conf->fuse_default_permissions) {
11001 int r = xattr_permission(in, name, MAY_READ, perms);
11002 if (r < 0)
11003 return r;
11004 }
11005
11006 return _getxattr(in, name, value, size, perms);
11007}
11008
// List all xattr names (real ones plus visible virtual ones) into `name`
// as a sequence of NUL-terminated strings. With size == 0 this is a probe
// that only returns the required buffer length. Returns the total length,
// -ERANGE if the caller's buffer is too small, or a negative error from
// _getattr().
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  // Force a fetch from the MDS only if we have never seen any xattrs.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    // First pass: total space needed (each name plus its NUL terminator).
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    if (size != 0) {
      if (size >= (unsigned)r) {
	// Second pass: copy the names, advancing `name` past each NUL.
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	if (vxattrs) {
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    // Hidden vxattrs are gettable but never listed.
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	r = -ERANGE;
    }
  }
  ldout(cct, 8) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11053
11054int Client::ll_listxattr(Inode *in, char *names, size_t size,
11055 const UserPerm& perms)
11056{
11057 Mutex::Locker lock(client_lock);
11058
181888fb
FG
11059 if (unmounting)
11060 return -ENOTCONN;
11061
7c673cae
FG
11062 vinodeno_t vino = _get_vino(in);
11063
11064 ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
11065 tout(cct) << "ll_listxattr" << std::endl;
11066 tout(cct) << vino.ino.val << std::endl;
11067 tout(cct) << size << std::endl;
11068
11069 return _listxattr(in, names, size, perms);
11070}
11071
// Issue a SETXATTR request to the MDS. A NULL `value` turns the request
// into a removal (CEPH_XATTR_REMOVE); XATTR_CREATE/XATTR_REPLACE map to
// their CEPH_XATTR_* counterparts. Returns the MDS reply code.
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // The xattr value travels in the request's data payload.
  bufferlist bl;
  bl.append((const char*)value, size);
  req->set_data(bl);

  // make_request() consumes the request and blocks for the reply.
  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
11103
// Validate and apply a setxattr. POSIX ACL xattrs get special handling:
// an access ACL that is equivalent to plain mode bits is converted into a
// chmod and the xattr itself is dropped (value becomes NULL → removal).
// Returns 0 or a negative errno.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // "system.*" names are only meaningful when POSIX ACLs are enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Accept the same xattr namespaces the kernel client accepts.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// ret == 0 means the ACL is fully expressible as mode bits, so the
	// xattr is redundant: clear value/size to turn this into a removal.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // Apply the mode change derived from the ACL.
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	// ret == 0: empty/trivial ACL — store nothing (removal).
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    // Read-only virtual xattrs cannot be set.
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}
11164
11165int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11166 size_t size, int flags, const UserPerm& perms)
11167{
11168 if (cct->_conf->client_permissions) {
11169 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11170 if (r < 0)
11171 return r;
11172 }
11173 return _setxattr(in.get(), name, value, size, flags, perms);
11174}
11175
// Validate the data pool referenced by a "layout"/"layout.pool" xattr value
// against the given OSDMap. Returns 0 when the pool exists (or no pool is
// named), -EINVAL for unparseable layout input, -ENOENT when the pool is
// unknown in this osdmap epoch.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    // Full layout string: parse "key=value ..." pairs and pick out "pool".
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      // The pool may be given numerically ...
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      // ... or by name.
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11215
11216void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11217{
11218 // For setting pool of layout, MetaRequest need osdmap epoch.
11219 // There is a race which create a new data pool but client and mds both don't have.
11220 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11221 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11222 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11223 string rest(strstr(name, "layout"));
11224 string v((const char*)value, size);
11225 int r = objecter->with_osdmap([&](const OSDMap& o) {
11226 return _setxattr_check_data_pool(rest, v, &o);
11227 });
11228
11229 if (r == -ENOENT) {
11230 C_SaferCond ctx;
11231 objecter->wait_for_latest_osdmap(&ctx);
11232 ctx.wait();
11233 }
11234 }
11235}
11236
11237int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11238 size_t size, int flags, const UserPerm& perms)
11239{
11240 _setxattr_maybe_wait_for_osdmap(name, value, size);
11241
11242 Mutex::Locker lock(client_lock);
11243
181888fb
FG
11244 if (unmounting)
11245 return -ENOTCONN;
11246
7c673cae
FG
11247 vinodeno_t vino = _get_vino(in);
11248
11249 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
11250 tout(cct) << "ll_setxattr" << std::endl;
11251 tout(cct) << vino.ino.val << std::endl;
11252 tout(cct) << name << std::endl;
11253
11254 if (!cct->_conf->fuse_default_permissions) {
11255 int r = xattr_permission(in, name, MAY_WRITE, perms);
11256 if (r < 0)
11257 return r;
11258 }
11259 return _setxattr(in, name, value, size, flags, perms);
11260}
11261
11262int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11263{
11264 if (in->snapid != CEPH_NOSNAP) {
11265 return -EROFS;
11266 }
11267
11268 // same xattrs supported by kernel client
11269 if (strncmp(name, "user.", 5) &&
11270 strncmp(name, "system.", 7) &&
11271 strncmp(name, "security.", 9) &&
11272 strncmp(name, "trusted.", 8) &&
11273 strncmp(name, "ceph.", 5))
11274 return -EOPNOTSUPP;
11275
11276 const VXattr *vxattr = _match_vxattr(in, name);
11277 if (vxattr && vxattr->readonly)
11278 return -EOPNOTSUPP;
11279
11280 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11281 filepath path;
11282 in->make_nosnap_relative_path(path);
11283 req->set_filepath(path);
11284 req->set_filepath2(name);
11285 req->set_inode(in);
11286
11287 int res = make_request(req, perms);
11288
11289 trim_cache();
1adf2230 11290 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
7c673cae
FG
11291 return res;
11292}
11293
11294int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11295{
11296 if (cct->_conf->client_permissions) {
11297 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11298 if (r < 0)
11299 return r;
11300 }
11301 return _removexattr(in.get(), name, perms);
11302}
11303
11304int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11305{
11306 Mutex::Locker lock(client_lock);
11307
181888fb
FG
11308 if (unmounting)
11309 return -ENOTCONN;
11310
7c673cae
FG
11311 vinodeno_t vino = _get_vino(in);
11312
11313 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11314 tout(cct) << "ll_removexattr" << std::endl;
11315 tout(cct) << vino.ino.val << std::endl;
11316 tout(cct) << name << std::endl;
11317
11318 if (!cct->_conf->fuse_default_permissions) {
11319 int r = xattr_permission(in, name, MAY_WRITE, perms);
11320 if (r < 0)
11321 return r;
11322 }
11323
11324 return _removexattr(in, name, perms);
11325}
11326
11327bool Client::_vxattrcb_quota_exists(Inode *in)
11328{
11329 return in->quota.is_enable();
11330}
11331size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11332{
11333 return snprintf(val, size,
11334 "max_bytes=%lld max_files=%lld",
11335 (long long int)in->quota.max_bytes,
11336 (long long int)in->quota.max_files);
11337}
11338size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11339{
11340 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11341}
11342size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11343{
11344 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11345}
11346
11347bool Client::_vxattrcb_layout_exists(Inode *in)
11348{
11349 return in->layout != file_layout_t();
11350}
11351size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11352{
11353 int r = snprintf(val, size,
11354 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
11355 (unsigned long long)in->layout.stripe_unit,
11356 (unsigned long long)in->layout.stripe_count,
11357 (unsigned long long)in->layout.object_size);
11358 objecter->with_osdmap([&](const OSDMap& o) {
11359 if (o.have_pg_pool(in->layout.pool_id))
11360 r += snprintf(val + r, size - r, "%s",
11361 o.get_pool_name(in->layout.pool_id).c_str());
11362 else
11363 r += snprintf(val + r, size - r, "%" PRIu64,
11364 (uint64_t)in->layout.pool_id);
11365 });
11366 if (in->layout.pool_ns.length())
11367 r += snprintf(val + r, size - r, " pool_namespace=%s",
11368 in->layout.pool_ns.c_str());
11369 return r;
11370}
11371size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11372{
11373 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
11374}
11375size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11376{
11377 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
11378}
11379size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11380{
11381 return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
11382}
11383size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11384{
11385 size_t r;
11386 objecter->with_osdmap([&](const OSDMap& o) {
11387 if (o.have_pg_pool(in->layout.pool_id))
11388 r = snprintf(val, size, "%s", o.get_pool_name(
11389 in->layout.pool_id).c_str());
11390 else
11391 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11392 });
11393 return r;
11394}
11395size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11396{
11397 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11398}
11399size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11400{
11401 return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11402}
11403size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11404{
11405 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
11406}
11407size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11408{
11409 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
11410}
11411size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11412{
11413 return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11414}
11415size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11416{
11417 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
11418}
11419size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11420{
11421 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
11422}
11423size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11424{
11425 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
11426}
11427size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11428{
11429 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11430 (long)in->rstat.rctime.nsec());
11431}
11432
// Helpers for building the "ceph.*" virtual xattr names and the static
// initializers of the VXattr tables below (GNU designated-initializer
// syntax, matching the VXattr field order).
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only, listed (non-hidden) vxattr with no existence callback.
#define XATTR_NAME_CEPH(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  hidden: false,						\
  exists_cb: NULL,						\
  flags: 0,							\
}
// Same as XATTR_NAME_CEPH but with explicit flags (e.g. VXATTR_RSTAT to
// force a fresh rstat fetch in _getxattr()).
#define XATTR_NAME_CEPH2(_type, _name, _flags)			\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  hidden: false,						\
  exists_cb: NULL,						\
  flags: _flags,						\
}
// Writable, hidden layout sub-field; only exists when the inode has a
// non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		\
{								\
  name: CEPH_XATTR_NAME2(_type, _name, _field),			\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	\
  readonly: false,						\
  hidden: true,							\
  exists_cb: &Client::_vxattrcb_layout_exists,			\
  flags: 0,							\
}
// Writable, hidden quota sub-field; only exists when a quota is set.
#define XATTR_QUOTA_FIELD(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: false,						\
  hidden: true,							\
  exists_cb: &Client::_vxattrcb_quota_exists,			\
  flags: 0,							\
}
11472
// Virtual xattrs exposed on directories, terminated by an empty-name entry.
// `hidden` entries are matched by _match_vxattr() but omitted from
// _listxattr(); `exists_cb` (when set) gates whether the attribute exists
// for a given inode at all.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  // Recursive (r*) statistics need a fresh rstat from the MDS.
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  { name: "" } /* Required table terminator */
};
11507
// Virtual xattrs exposed on regular files (layout only), terminated by an
// empty-name entry.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" } /* Required table terminator */
};
11524
11525const Client::VXattr *Client::_get_vxattrs(Inode *in)
11526{
11527 if (in->is_dir())
11528 return _dir_vxattrs;
11529 else if (in->is_file())
11530 return _file_vxattrs;
11531 return NULL;
11532}
11533
11534const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11535{
11536 if (strncmp(name, "ceph.", 5) == 0) {
11537 const VXattr *vxattr = _get_vxattrs(in);
11538 if (vxattr) {
11539 while (!vxattr->name.empty()) {
11540 if (vxattr->name == name)
11541 return vxattr;
11542 vxattr++;
11543 }
11544 }
11545 }
11546 return NULL;
11547}
11548
11549size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11550{
11551 size_t len = 0;
11552 while (!vxattr->name.empty()) {
11553 if (!vxattr->hidden)
11554 len += vxattr->name.length() + 1;
11555 vxattr++;
11556 }
11557 return len;
11558}
11559
11560int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11561{
11562 Mutex::Locker lock(client_lock);
11563
181888fb
FG
11564 if (unmounting)
11565 return -ENOTCONN;
11566
7c673cae
FG
11567 vinodeno_t vino = _get_vino(in);
11568
11569 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11570 tout(cct) << "ll_readlink" << std::endl;
11571 tout(cct) << vino.ino.val << std::endl;
11572
11573 set<Dentry*>::iterator dn = in->dn_set.begin();
11574 while (dn != in->dn_set.end()) {
11575 touch_dn(*dn);
11576 ++dn;
11577 }
11578
11579 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11580 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11581 return r;
11582}
11583
// Create a filesystem node `name` under `dir` via an MDS MKNOD request.
// `mode` may be adjusted by POSIX ACL inheritance; on success *inp refers
// to the new inode. Returns 0 or a negative errno (-ENAMETOOLONG, -EROFS
// for snapshots, -EDQUOT when a file quota is exceeded, or an MDS error).
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // Invalidate the parent's dentry lease unless we hold exclusive caps.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // ACL inheritance may rewrite `mode` and produce initial xattrs that are
  // shipped with the request payload.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // On local failure the request was never submitted; release it here.
  put_request(req);
  return res;
}
11637
11638int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
11639 dev_t rdev, struct stat *attr, Inode **out,
11640 const UserPerm& perms)
11641{
11642 Mutex::Locker lock(client_lock);
11643
181888fb
FG
11644 if (unmounting)
11645 return -ENOTCONN;
11646
7c673cae
FG
11647 vinodeno_t vparent = _get_vino(parent);
11648
11649 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
11650 tout(cct) << "ll_mknod" << std::endl;
11651 tout(cct) << vparent.ino.val << std::endl;
11652 tout(cct) << name << std::endl;
11653 tout(cct) << mode << std::endl;
11654 tout(cct) << rdev << std::endl;
11655
11656 if (!cct->_conf->fuse_default_permissions) {
11657 int r = may_create(parent, perms);
11658 if (r < 0)
11659 return r;
11660 }
11661
11662 InodeRef in;
11663 int r = _mknod(parent, name, mode, rdev, perms, &in);
11664 if (r == 0) {
11665 fill_stat(in, attr);
11666 _ll_get(in.get());
11667 }
11668 tout(cct) << attr->st_ino << std::endl;
11669 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
11670 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11671 *out = in.get();
11672 return r;
11673}
11674
11675int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
11676 dev_t rdev, Inode **out,
11677 struct ceph_statx *stx, unsigned want, unsigned flags,
11678 const UserPerm& perms)
11679{
11680 unsigned caps = statx_to_mask(flags, want);
11681 Mutex::Locker lock(client_lock);
11682
181888fb
FG
11683 if (unmounting)
11684 return -ENOTCONN;
11685
7c673cae
FG
11686 vinodeno_t vparent = _get_vino(parent);
11687
11688 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
11689 tout(cct) << "ll_mknodx" << std::endl;
11690 tout(cct) << vparent.ino.val << std::endl;
11691 tout(cct) << name << std::endl;
11692 tout(cct) << mode << std::endl;
11693 tout(cct) << rdev << std::endl;
11694
11695 if (!cct->_conf->fuse_default_permissions) {
11696 int r = may_create(parent, perms);
11697 if (r < 0)
11698 return r;
11699 }
11700
11701 InodeRef in;
11702 int r = _mknod(parent, name, mode, rdev, perms, &in);
11703 if (r == 0) {
11704 fill_statx(in, caps, stx);
11705 _ll_get(in.get());
11706 }
11707 tout(cct) << stx->stx_ino << std::endl;
11708 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
11709 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11710 *out = in.get();
11711 return r;
11712}
11713
11714int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
11715 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
11716 int object_size, const char *data_pool, bool *created,
11717 const UserPerm& perms)
11718{
1adf2230 11719 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
7c673cae
FG
11720 mode << dec << ")" << dendl;
11721
11722 if (strlen(name) > NAME_MAX)
11723 return -ENAMETOOLONG;
11724 if (dir->snapid != CEPH_NOSNAP) {
11725 return -EROFS;
11726 }
11727 if (is_quota_files_exceeded(dir, perms)) {
11728 return -EDQUOT;
11729 }
11730
11731 // use normalized flags to generate cmode
11732 int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
11733 if (cmode < 0)
11734 return -EINVAL;
11735
11736 int64_t pool_id = -1;
11737 if (data_pool && *data_pool) {
11738 pool_id = objecter->with_osdmap(
11739 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
11740 if (pool_id < 0)
11741 return -EINVAL;
11742 if (pool_id > 0xffffffffll)
11743 return -ERANGE; // bummer!
11744 }
11745
11746 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
11747
11748 filepath path;
11749 dir->make_nosnap_relative_path(path);
11750 path.push_dentry(name);
11751 req->set_filepath(path);
11752 req->set_inode(dir);
11753 req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);
11754
11755 req->head.args.open.stripe_unit = stripe_unit;
11756 req->head.args.open.stripe_count = stripe_count;
11757 req->head.args.open.object_size = object_size;
11758 if (cct->_conf->client_debug_getattr_caps)
11759 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
11760 else
11761 req->head.args.open.mask = 0;
11762 req->head.args.open.pool = pool_id;
11763 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11764 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11765
11766 mode |= S_IFREG;
11767 bufferlist xattrs_bl;
11768 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11769 if (res < 0)
11770 goto fail;
11771 req->head.args.open.mode = mode;
11772 if (xattrs_bl.length() > 0)
11773 req->set_data(xattrs_bl);
11774
11775 Dentry *de;
11776 res = get_or_create(dir, name, &de);
11777 if (res < 0)
11778 goto fail;
11779 req->set_dentry(de);
11780
11781 res = make_request(req, perms, inp, created);
11782 if (res < 0) {
11783 goto reply_error;
11784 }
11785
11786 /* If the caller passed a value in fhp, do the open */
11787 if(fhp) {
11788 (*inp)->get_open_ref(cmode);
11789 *fhp = _create_fh(inp->get(), flags, cmode, perms);
11790 }
11791
11792 reply_error:
11793 trim_cache();
11794
1adf2230 11795 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
7c673cae
FG
11796 << " layout " << stripe_unit
11797 << ' ' << stripe_count
11798 << ' ' << object_size
11799 <<") = " << res << dendl;
11800 return res;
11801
11802 fail:
11803 put_request(req);
11804 return res;
11805}
11806
11807
11808int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
11809 InodeRef *inp)
11810{
1adf2230 11811 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
11812 << mode << dec << ", uid " << perm.uid()
11813 << ", gid " << perm.gid() << ")" << dendl;
11814
11815 if (strlen(name) > NAME_MAX)
11816 return -ENAMETOOLONG;
11817
11818 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
11819 return -EROFS;
11820 }
11821 if (is_quota_files_exceeded(dir, perm)) {
11822 return -EDQUOT;
11823 }
11824 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
11825 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
11826
11827 filepath path;
11828 dir->make_nosnap_relative_path(path);
11829 path.push_dentry(name);
11830 req->set_filepath(path);
11831 req->set_inode(dir);
11832 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11833 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11834
11835 mode |= S_IFDIR;
11836 bufferlist xattrs_bl;
11837 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
11838 if (res < 0)
11839 goto fail;
11840 req->head.args.mkdir.mode = mode;
11841 if (xattrs_bl.length() > 0)
11842 req->set_data(xattrs_bl);
11843
11844 Dentry *de;
11845 res = get_or_create(dir, name, &de);
11846 if (res < 0)
11847 goto fail;
11848 req->set_dentry(de);
11849
11850 ldout(cct, 10) << "_mkdir: making request" << dendl;
11851 res = make_request(req, perm, inp);
11852 ldout(cct, 10) << "_mkdir result is " << res << dendl;
11853
11854 trim_cache();
11855
1adf2230 11856 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
11857 return res;
11858
11859 fail:
11860 put_request(req);
11861 return res;
11862}
11863
11864int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
11865 struct stat *attr, Inode **out, const UserPerm& perm)
11866{
11867 Mutex::Locker lock(client_lock);
11868
181888fb
FG
11869 if (unmounting)
11870 return -ENOTCONN;
11871
7c673cae
FG
11872 vinodeno_t vparent = _get_vino(parent);
11873
11874 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
11875 tout(cct) << "ll_mkdir" << std::endl;
11876 tout(cct) << vparent.ino.val << std::endl;
11877 tout(cct) << name << std::endl;
11878 tout(cct) << mode << std::endl;
11879
11880 if (!cct->_conf->fuse_default_permissions) {
11881 int r = may_create(parent, perm);
11882 if (r < 0)
11883 return r;
11884 }
11885
11886 InodeRef in;
11887 int r = _mkdir(parent, name, mode, perm, &in);
11888 if (r == 0) {
11889 fill_stat(in, attr);
11890 _ll_get(in.get());
11891 }
11892 tout(cct) << attr->st_ino << std::endl;
11893 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
11894 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11895 *out = in.get();
11896 return r;
11897}
11898
11899int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
11900 struct ceph_statx *stx, unsigned want, unsigned flags,
11901 const UserPerm& perms)
11902{
11903 Mutex::Locker lock(client_lock);
11904
181888fb
FG
11905 if (unmounting)
11906 return -ENOTCONN;
11907
7c673cae
FG
11908 vinodeno_t vparent = _get_vino(parent);
11909
11910 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
11911 tout(cct) << "ll_mkdirx" << std::endl;
11912 tout(cct) << vparent.ino.val << std::endl;
11913 tout(cct) << name << std::endl;
11914 tout(cct) << mode << std::endl;
11915
11916 if (!cct->_conf->fuse_default_permissions) {
11917 int r = may_create(parent, perms);
11918 if (r < 0)
11919 return r;
11920 }
11921
11922 InodeRef in;
11923 int r = _mkdir(parent, name, mode, perms, &in);
11924 if (r == 0) {
11925 fill_statx(in, statx_to_mask(flags, want), stx);
11926 _ll_get(in.get());
11927 } else {
11928 stx->stx_ino = 0;
11929 stx->stx_mask = 0;
11930 }
11931 tout(cct) << stx->stx_ino << std::endl;
11932 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
11933 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11934 *out = in.get();
11935 return r;
11936}
11937
11938int Client::_symlink(Inode *dir, const char *name, const char *target,
11939 const UserPerm& perms, InodeRef *inp)
11940{
1adf2230 11941 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
7c673cae
FG
11942 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
11943 << dendl;
11944
11945 if (strlen(name) > NAME_MAX)
11946 return -ENAMETOOLONG;
11947
11948 if (dir->snapid != CEPH_NOSNAP) {
11949 return -EROFS;
11950 }
11951 if (is_quota_files_exceeded(dir, perms)) {
11952 return -EDQUOT;
11953 }
11954
11955 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
11956
11957 filepath path;
11958 dir->make_nosnap_relative_path(path);
11959 path.push_dentry(name);
11960 req->set_filepath(path);
11961 req->set_inode(dir);
11962 req->set_string2(target);
11963 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11964 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11965
11966 Dentry *de;
11967 int res = get_or_create(dir, name, &de);
11968 if (res < 0)
11969 goto fail;
11970 req->set_dentry(de);
11971
11972 res = make_request(req, perms, inp);
11973
11974 trim_cache();
1adf2230 11975 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
7c673cae
FG
11976 res << dendl;
11977 return res;
11978
11979 fail:
11980 put_request(req);
11981 return res;
11982}
11983
11984int Client::ll_symlink(Inode *parent, const char *name, const char *value,
11985 struct stat *attr, Inode **out, const UserPerm& perms)
11986{
11987 Mutex::Locker lock(client_lock);
11988
181888fb
FG
11989 if (unmounting)
11990 return -ENOTCONN;
11991
7c673cae
FG
11992 vinodeno_t vparent = _get_vino(parent);
11993
11994 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
11995 << dendl;
11996 tout(cct) << "ll_symlink" << std::endl;
11997 tout(cct) << vparent.ino.val << std::endl;
11998 tout(cct) << name << std::endl;
11999 tout(cct) << value << std::endl;
12000
12001 if (!cct->_conf->fuse_default_permissions) {
12002 int r = may_create(parent, perms);
12003 if (r < 0)
12004 return r;
12005 }
12006
12007 InodeRef in;
12008 int r = _symlink(parent, name, value, perms, &in);
12009 if (r == 0) {
12010 fill_stat(in, attr);
12011 _ll_get(in.get());
12012 }
12013 tout(cct) << attr->st_ino << std::endl;
12014 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12015 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12016 *out = in.get();
12017 return r;
12018}
12019
12020int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12021 Inode **out, struct ceph_statx *stx, unsigned want,
12022 unsigned flags, const UserPerm& perms)
12023{
12024 Mutex::Locker lock(client_lock);
12025
181888fb
FG
12026 if (unmounting)
12027 return -ENOTCONN;
12028
7c673cae
FG
12029 vinodeno_t vparent = _get_vino(parent);
12030
12031 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12032 << dendl;
12033 tout(cct) << "ll_symlinkx" << std::endl;
12034 tout(cct) << vparent.ino.val << std::endl;
12035 tout(cct) << name << std::endl;
12036 tout(cct) << value << std::endl;
12037
12038 if (!cct->_conf->fuse_default_permissions) {
12039 int r = may_create(parent, perms);
12040 if (r < 0)
12041 return r;
12042 }
12043
12044 InodeRef in;
12045 int r = _symlink(parent, name, value, perms, &in);
12046 if (r == 0) {
12047 fill_statx(in, statx_to_mask(flags, want), stx);
12048 _ll_get(in.get());
12049 }
12050 tout(cct) << stx->stx_ino << std::endl;
12051 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12052 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12053 *out = in.get();
12054 return r;
12055}
12056
12057int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12058{
1adf2230 12059 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
7c673cae
FG
12060 << " uid " << perm.uid() << " gid " << perm.gid()
12061 << ")" << dendl;
12062
12063 if (dir->snapid != CEPH_NOSNAP) {
12064 return -EROFS;
12065 }
12066
12067 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12068
12069 filepath path;
12070 dir->make_nosnap_relative_path(path);
12071 path.push_dentry(name);
12072 req->set_filepath(path);
12073
12074 InodeRef otherin;
b32b8144 12075 Inode *in;
7c673cae 12076 Dentry *de;
b32b8144 12077
7c673cae
FG
12078 int res = get_or_create(dir, name, &de);
12079 if (res < 0)
12080 goto fail;
12081 req->set_dentry(de);
12082 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12083 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12084
12085 res = _lookup(dir, name, 0, &otherin, perm);
12086 if (res < 0)
12087 goto fail;
b32b8144
FG
12088
12089 in = otherin.get();
12090 req->set_other_inode(in);
12091 in->break_all_delegs();
7c673cae
FG
12092 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12093
12094 req->set_inode(dir);
12095
12096 res = make_request(req, perm);
12097
12098 trim_cache();
1adf2230 12099 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
7c673cae
FG
12100 return res;
12101
12102 fail:
12103 put_request(req);
12104 return res;
12105}
12106
12107int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12108{
12109 Mutex::Locker lock(client_lock);
12110
181888fb
FG
12111 if (unmounting)
12112 return -ENOTCONN;
12113
7c673cae
FG
12114 vinodeno_t vino = _get_vino(in);
12115
12116 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12117 tout(cct) << "ll_unlink" << std::endl;
12118 tout(cct) << vino.ino.val << std::endl;
12119 tout(cct) << name << std::endl;
12120
12121 if (!cct->_conf->fuse_default_permissions) {
12122 int r = may_delete(in, name, perm);
12123 if (r < 0)
12124 return r;
12125 }
12126 return _unlink(in, name, perm);
12127}
12128
12129int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12130{
1adf2230 12131 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
12132 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12133
12134 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12135 return -EROFS;
12136 }
b32b8144
FG
12137
12138 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12139 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
12140 filepath path;
12141 dir->make_nosnap_relative_path(path);
12142 path.push_dentry(name);
12143 req->set_filepath(path);
12144
12145 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12146 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12147 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12148
12149 InodeRef in;
12150
12151 Dentry *de;
12152 int res = get_or_create(dir, name, &de);
12153 if (res < 0)
12154 goto fail;
b32b8144
FG
12155 if (op == CEPH_MDS_OP_RMDIR)
12156 req->set_dentry(de);
12157 else
12158 de->get();
12159
7c673cae
FG
12160 res = _lookup(dir, name, 0, &in, perms);
12161 if (res < 0)
12162 goto fail;
b32b8144 12163 if (op == CEPH_MDS_OP_RMDIR) {
7c673cae 12164 req->set_inode(dir);
7c673cae
FG
12165 req->set_other_inode(in.get());
12166 } else {
12167 unlink(de, true, true);
b32b8144 12168 de->put();
7c673cae
FG
12169 req->set_other_inode(in.get());
12170 }
12171
12172 res = make_request(req, perms);
12173
12174 trim_cache();
1adf2230 12175 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae
FG
12176 return res;
12177
12178 fail:
12179 put_request(req);
12180 return res;
12181}
12182
12183int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12184{
12185 Mutex::Locker lock(client_lock);
12186
181888fb
FG
12187 if (unmounting)
12188 return -ENOTCONN;
12189
7c673cae
FG
12190 vinodeno_t vino = _get_vino(in);
12191
12192 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12193 tout(cct) << "ll_rmdir" << std::endl;
12194 tout(cct) << vino.ino.val << std::endl;
12195 tout(cct) << name << std::endl;
12196
12197 if (!cct->_conf->fuse_default_permissions) {
12198 int r = may_delete(in, name, perms);
12199 if (r < 0)
12200 return r;
12201 }
12202
12203 return _rmdir(in, name, perms);
12204}
12205
12206int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12207{
1adf2230 12208 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
7c673cae
FG
12209 << todir->ino << " " << toname
12210 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12211 << dendl;
12212
12213 if (fromdir->snapid != todir->snapid)
12214 return -EXDEV;
12215
12216 int op = CEPH_MDS_OP_RENAME;
12217 if (fromdir->snapid != CEPH_NOSNAP) {
12218 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12219 op = CEPH_MDS_OP_RENAMESNAP;
12220 else
12221 return -EROFS;
12222 }
12223 if (fromdir != todir) {
12224 Inode *fromdir_root =
12225 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12226 Inode *todir_root =
12227 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12228 if (fromdir_root != todir_root) {
12229 return -EXDEV;
12230 }
12231 }
12232
12233 InodeRef target;
12234 MetaRequest *req = new MetaRequest(op);
12235
12236 filepath from;
12237 fromdir->make_nosnap_relative_path(from);
12238 from.push_dentry(fromname);
12239 filepath to;
12240 todir->make_nosnap_relative_path(to);
12241 to.push_dentry(toname);
12242 req->set_filepath(to);
12243 req->set_filepath2(from);
12244
12245 Dentry *oldde;
12246 int res = get_or_create(fromdir, fromname, &oldde);
12247 if (res < 0)
12248 goto fail;
12249 Dentry *de;
12250 res = get_or_create(todir, toname, &de);
12251 if (res < 0)
12252 goto fail;
12253
12254 if (op == CEPH_MDS_OP_RENAME) {
12255 req->set_old_dentry(oldde);
12256 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12257 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12258
12259 req->set_dentry(de);
12260 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12261 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12262
12263 InodeRef oldin, otherin;
12264 res = _lookup(fromdir, fromname, 0, &oldin, perm);
12265 if (res < 0)
12266 goto fail;
b32b8144
FG
12267
12268 Inode *oldinode = oldin.get();
12269 oldinode->break_all_delegs();
12270 req->set_old_inode(oldinode);
7c673cae
FG
12271 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12272
12273 res = _lookup(todir, toname, 0, &otherin, perm);
b32b8144
FG
12274 switch (res) {
12275 case 0:
12276 {
12277 Inode *in = otherin.get();
12278 req->set_other_inode(in);
12279 in->break_all_delegs();
12280 }
7c673cae 12281 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
b32b8144
FG
12282 break;
12283 case -ENOENT:
12284 break;
12285 default:
12286 goto fail;
7c673cae
FG
12287 }
12288
12289 req->set_inode(todir);
12290 } else {
12291 // renamesnap reply contains no tracedn, so we need to invalidate
12292 // dentry manually
12293 unlink(oldde, true, true);
12294 unlink(de, true, true);
12295 }
12296
12297 res = make_request(req, perm, &target);
12298 ldout(cct, 10) << "rename result is " << res << dendl;
12299
12300 // renamed item from our cache
12301
12302 trim_cache();
1adf2230 12303 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
7c673cae
FG
12304 return res;
12305
12306 fail:
12307 put_request(req);
12308 return res;
12309}
12310
12311int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12312 const char *newname, const UserPerm& perm)
12313{
12314 Mutex::Locker lock(client_lock);
12315
181888fb
FG
12316 if (unmounting)
12317 return -ENOTCONN;
12318
7c673cae
FG
12319 vinodeno_t vparent = _get_vino(parent);
12320 vinodeno_t vnewparent = _get_vino(newparent);
12321
12322 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12323 << vnewparent << " " << newname << dendl;
12324 tout(cct) << "ll_rename" << std::endl;
12325 tout(cct) << vparent.ino.val << std::endl;
12326 tout(cct) << name << std::endl;
12327 tout(cct) << vnewparent.ino.val << std::endl;
12328 tout(cct) << newname << std::endl;
12329
12330 if (!cct->_conf->fuse_default_permissions) {
12331 int r = may_delete(parent, name, perm);
12332 if (r < 0)
12333 return r;
12334 r = may_delete(newparent, newname, perm);
12335 if (r < 0 && r != -ENOENT)
12336 return r;
12337 }
12338
12339 return _rename(parent, name, newparent, newname, perm);
12340}
12341
12342int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12343{
1adf2230 12344 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
7c673cae
FG
12345 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12346
12347 if (strlen(newname) > NAME_MAX)
12348 return -ENAMETOOLONG;
12349
12350 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12351 return -EROFS;
12352 }
12353 if (is_quota_files_exceeded(dir, perm)) {
12354 return -EDQUOT;
12355 }
12356
b32b8144 12357 in->break_all_delegs();
7c673cae
FG
12358 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12359
12360 filepath path(newname, dir->ino);
12361 req->set_filepath(path);
12362 filepath existing(in->ino);
12363 req->set_filepath2(existing);
12364
12365 req->set_inode(dir);
12366 req->inode_drop = CEPH_CAP_FILE_SHARED;
12367 req->inode_unless = CEPH_CAP_FILE_EXCL;
12368
12369 Dentry *de;
12370 int res = get_or_create(dir, newname, &de);
12371 if (res < 0)
12372 goto fail;
12373 req->set_dentry(de);
12374
12375 res = make_request(req, perm, inp);
12376 ldout(cct, 10) << "link result is " << res << dendl;
12377
12378 trim_cache();
1adf2230 12379 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
7c673cae
FG
12380 return res;
12381
12382 fail:
12383 put_request(req);
12384 return res;
12385}
12386
12387int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12388 const UserPerm& perm)
12389{
12390 Mutex::Locker lock(client_lock);
12391
181888fb
FG
12392 if (unmounting)
12393 return -ENOTCONN;
12394
7c673cae
FG
12395 vinodeno_t vino = _get_vino(in);
12396 vinodeno_t vnewparent = _get_vino(newparent);
12397
31f18b77 12398 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
12399 newname << dendl;
12400 tout(cct) << "ll_link" << std::endl;
12401 tout(cct) << vino.ino.val << std::endl;
12402 tout(cct) << vnewparent << std::endl;
12403 tout(cct) << newname << std::endl;
12404
12405 int r = 0;
12406 InodeRef target;
12407
12408 if (!cct->_conf->fuse_default_permissions) {
12409 if (S_ISDIR(in->mode))
12410 return -EPERM;
12411
12412 r = may_hardlink(in, perm);
12413 if (r < 0)
12414 return r;
12415
12416 r = may_create(newparent, perm);
12417 if (r < 0)
12418 return r;
12419 }
12420
12421 return _link(in, newparent, newname, perm, &target);
12422}
12423
12424int Client::ll_num_osds(void)
12425{
12426 Mutex::Locker lock(client_lock);
12427 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12428}
12429
12430int Client::ll_osdaddr(int osd, uint32_t *addr)
12431{
12432 Mutex::Locker lock(client_lock);
181888fb 12433
7c673cae
FG
12434 entity_addr_t g;
12435 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12436 if (!o.exists(osd))
12437 return false;
12438 g = o.get_addr(osd);
12439 return true;
12440 });
12441 if (!exists)
12442 return -1;
12443 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12444 *addr = ntohl(nb_addr);
12445 return 0;
12446}
181888fb 12447
7c673cae
FG
12448uint32_t Client::ll_stripe_unit(Inode *in)
12449{
12450 Mutex::Locker lock(client_lock);
12451 return in->layout.stripe_unit;
12452}
12453
12454uint64_t Client::ll_snap_seq(Inode *in)
12455{
12456 Mutex::Locker lock(client_lock);
12457 return in->snaprealm->seq;
12458}
12459
12460int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12461{
12462 Mutex::Locker lock(client_lock);
12463 *layout = in->layout;
12464 return 0;
12465}
12466
12467int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12468{
12469 return ll_file_layout(fh->inode.get(), layout);
12470}
12471
12472/* Currently we cannot take advantage of redundancy in reads, since we
12473 would have to go through all possible placement groups (a
12474 potentially quite large number determined by a hash), and use CRUSH
12475 to calculate the appropriate set of OSDs for each placement group,
12476 then index into that. An array with one entry per OSD is much more
12477 tractable and works for demonstration purposes. */
12478
12479int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12480 file_layout_t* layout)
12481{
12482 Mutex::Locker lock(client_lock);
181888fb 12483
28e407b8 12484 inodeno_t ino = in->ino;
7c673cae
FG
12485 uint32_t object_size = layout->object_size;
12486 uint32_t su = layout->stripe_unit;
12487 uint32_t stripe_count = layout->stripe_count;
12488 uint64_t stripes_per_object = object_size / su;
12489
12490 uint64_t stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12491 uint64_t stripepos = blockno % stripe_count; // which object in the object set (X)
12492 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12493 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12494
12495 object_t oid = file_object_t(ino, objectno);
12496 return objecter->with_osdmap([&](const OSDMap& o) {
12497 ceph_object_layout olayout =
12498 o.file_to_object_layout(oid, *layout);
12499 pg_t pg = (pg_t)olayout.ol_pgid;
12500 vector<int> osds;
12501 int primary;
12502 o.pg_to_acting_osds(pg, &osds, &primary);
12503 return primary;
12504 });
12505}
12506
12507/* Return the offset of the block, internal to the object */
12508
12509uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12510{
12511 Mutex::Locker lock(client_lock);
12512 file_layout_t *layout=&(in->layout);
12513 uint32_t object_size = layout->object_size;
12514 uint32_t su = layout->stripe_unit;
12515 uint64_t stripes_per_object = object_size / su;
12516
12517 return (blockno % stripes_per_object) * su;
12518}
12519
12520int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12521 const UserPerm& perms)
12522{
12523 Mutex::Locker lock(client_lock);
12524
181888fb
FG
12525 if (unmounting)
12526 return -ENOTCONN;
12527
7c673cae
FG
12528 vinodeno_t vino = _get_vino(in);
12529
12530 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12531 tout(cct) << "ll_opendir" << std::endl;
12532 tout(cct) << vino.ino.val << std::endl;
12533
12534 if (!cct->_conf->fuse_default_permissions) {
12535 int r = may_open(in, flags, perms);
12536 if (r < 0)
12537 return r;
12538 }
12539
12540 int r = _opendir(in, dirpp, perms);
12541 tout(cct) << (unsigned long)*dirpp << std::endl;
12542
12543 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12544 << dendl;
12545 return r;
12546}
12547
12548int Client::ll_releasedir(dir_result_t *dirp)
12549{
12550 Mutex::Locker lock(client_lock);
12551 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12552 tout(cct) << "ll_releasedir" << std::endl;
12553 tout(cct) << (unsigned long)dirp << std::endl;
181888fb
FG
12554
12555 if (unmounting)
12556 return -ENOTCONN;
12557
7c673cae
FG
12558 _closedir(dirp);
12559 return 0;
12560}
12561
12562int Client::ll_fsyncdir(dir_result_t *dirp)
12563{
12564 Mutex::Locker lock(client_lock);
12565 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12566 tout(cct) << "ll_fsyncdir" << std::endl;
12567 tout(cct) << (unsigned long)dirp << std::endl;
12568
181888fb
FG
12569 if (unmounting)
12570 return -ENOTCONN;
12571
7c673cae
FG
12572 return _fsync(dirp->inode.get(), false);
12573}
12574
12575int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12576{
12577 assert(!(flags & O_CREAT));
12578
12579 Mutex::Locker lock(client_lock);
12580
181888fb
FG
12581 if (unmounting)
12582 return -ENOTCONN;
12583
7c673cae
FG
12584 vinodeno_t vino = _get_vino(in);
12585
12586 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
12587 tout(cct) << "ll_open" << std::endl;
12588 tout(cct) << vino.ino.val << std::endl;
12589 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12590
12591 int r;
12592 if (!cct->_conf->fuse_default_permissions) {
12593 r = may_open(in, flags, perms);
12594 if (r < 0)
12595 goto out;
12596 }
12597
12598 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
12599
12600 out:
12601 Fh *fhptr = fhp ? *fhp : NULL;
12602 if (fhptr) {
12603 ll_unclosed_fh_set.insert(fhptr);
12604 }
12605 tout(cct) << (unsigned long)fhptr << std::endl;
12606 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
12607 " = " << r << " (" << fhptr << ")" << dendl;
12608 return r;
12609}
12610
12611int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
12612 int flags, InodeRef *in, int caps, Fh **fhp,
12613 const UserPerm& perms)
12614{
12615 *fhp = NULL;
12616
12617 vinodeno_t vparent = _get_vino(parent);
12618
1adf2230 12619 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
12620 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
12621 << ", gid " << perms.gid() << dendl;
12622 tout(cct) << "ll_create" << std::endl;
12623 tout(cct) << vparent.ino.val << std::endl;
12624 tout(cct) << name << std::endl;
12625 tout(cct) << mode << std::endl;
12626 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12627
12628 bool created = false;
12629 int r = _lookup(parent, name, caps, in, perms);
12630
12631 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
12632 return -EEXIST;
12633
12634 if (r == -ENOENT && (flags & O_CREAT)) {
12635 if (!cct->_conf->fuse_default_permissions) {
12636 r = may_create(parent, perms);
12637 if (r < 0)
12638 goto out;
12639 }
12640 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
12641 perms);
12642 if (r < 0)
12643 goto out;
12644 }
12645
12646 if (r < 0)
12647 goto out;
12648
12649 assert(*in);
12650
12651 ldout(cct, 20) << "_ll_create created = " << created << dendl;
12652 if (!created) {
12653 if (!cct->_conf->fuse_default_permissions) {
12654 r = may_open(in->get(), flags, perms);
12655 if (r < 0) {
12656 if (*fhp) {
12657 int release_r = _release_fh(*fhp);
12658 assert(release_r == 0); // during create, no async data ops should have happened
12659 }
12660 goto out;
12661 }
12662 }
12663 if (*fhp == NULL) {
12664 r = _open(in->get(), flags, mode, fhp, perms);
12665 if (r < 0)
12666 goto out;
12667 }
12668 }
12669
12670out:
12671 if (*fhp) {
12672 ll_unclosed_fh_set.insert(*fhp);
12673 }
12674
12675 ino_t ino = 0;
12676 if (r >= 0) {
12677 Inode *inode = in->get();
12678 if (use_faked_inos())
12679 ino = inode->faked_ino;
12680 else
12681 ino = inode->ino;
12682 }
12683
12684 tout(cct) << (unsigned long)*fhp << std::endl;
12685 tout(cct) << ino << std::endl;
1adf2230 12686 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
12687 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
12688 *fhp << " " << hex << ino << dec << ")" << dendl;
12689
12690 return r;
12691}
12692
12693int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12694 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12695 const UserPerm& perms)
12696{
12697 Mutex::Locker lock(client_lock);
12698 InodeRef in;
12699
181888fb
FG
12700 if (unmounting)
12701 return -ENOTCONN;
12702
7c673cae
FG
12703 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12704 fhp, perms);
12705 if (r >= 0) {
12706 assert(in);
12707
12708 // passing an Inode in outp requires an additional ref
12709 if (outp) {
12710 _ll_get(in.get());
12711 *outp = in.get();
12712 }
12713 fill_stat(in, attr);
12714 } else {
12715 attr->st_ino = 0;
12716 }
12717
12718 return r;
12719}
12720
12721int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12722 int oflags, Inode **outp, Fh **fhp,
12723 struct ceph_statx *stx, unsigned want, unsigned lflags,
12724 const UserPerm& perms)
12725{
12726 unsigned caps = statx_to_mask(lflags, want);
12727 Mutex::Locker lock(client_lock);
12728 InodeRef in;
12729
181888fb
FG
12730 if (unmounting)
12731 return -ENOTCONN;
7c673cae
FG
12732
12733 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12734 if (r >= 0) {
12735 assert(in);
12736
12737 // passing an Inode in outp requires an additional ref
12738 if (outp) {
12739 _ll_get(in.get());
12740 *outp = in.get();
12741 }
12742 fill_statx(in, caps, stx);
12743 } else {
12744 stx->stx_ino = 0;
12745 stx->stx_mask = 0;
12746 }
12747
12748 return r;
12749}
12750
12751loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12752{
12753 Mutex::Locker lock(client_lock);
12754 tout(cct) << "ll_lseek" << std::endl;
12755 tout(cct) << offset << std::endl;
12756 tout(cct) << whence << std::endl;
12757
181888fb
FG
12758 if (unmounting)
12759 return -ENOTCONN;
12760
7c673cae
FG
12761 return _lseek(fh, offset, whence);
12762}
12763
12764int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12765{
12766 Mutex::Locker lock(client_lock);
12767 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12768 tout(cct) << "ll_read" << std::endl;
12769 tout(cct) << (unsigned long)fh << std::endl;
12770 tout(cct) << off << std::endl;
12771 tout(cct) << len << std::endl;
12772
181888fb
FG
12773 if (unmounting)
12774 return -ENOTCONN;
12775
7c673cae
FG
12776 return _read(fh, off, len, bl);
12777}
12778
12779int Client::ll_read_block(Inode *in, uint64_t blockid,
12780 char *buf,
12781 uint64_t offset,
12782 uint64_t length,
12783 file_layout_t* layout)
12784{
12785 Mutex::Locker lock(client_lock);
181888fb
FG
12786
12787 if (unmounting)
12788 return -ENOTCONN;
12789
b32b8144 12790 vinodeno_t vino = _get_vino(in);
7c673cae
FG
12791 object_t oid = file_object_t(vino.ino, blockid);
12792 C_SaferCond onfinish;
12793 bufferlist bl;
12794
12795 objecter->read(oid,
12796 object_locator_t(layout->pool_id),
12797 offset,
12798 length,
12799 vino.snapid,
12800 &bl,
12801 CEPH_OSD_FLAG_READ,
12802 &onfinish);
12803
12804 client_lock.Unlock();
12805 int r = onfinish.wait();
12806 client_lock.Lock();
12807
12808 if (r >= 0) {
12809 bl.copy(0, bl.length(), buf);
12810 r = bl.length();
12811 }
12812
12813 return r;
12814}
12815
12816/* It appears that the OSD doesn't return success unless the entire
12817 buffer was written, return the write length on success. */
12818
12819int Client::ll_write_block(Inode *in, uint64_t blockid,
12820 char* buf, uint64_t offset,
12821 uint64_t length, file_layout_t* layout,
12822 uint64_t snapseq, uint32_t sync)
12823{
12824 Mutex flock("Client::ll_write_block flock");
12825 vinodeno_t vino = ll_get_vino(in);
12826 Cond cond;
12827 bool done;
12828 int r = 0;
181888fb 12829 Context *onsafe = nullptr;
7c673cae
FG
12830
12831 if (length == 0) {
12832 return -EINVAL;
12833 }
12834 if (true || sync) {
12835 /* if write is stable, the epilogue is waiting on
12836 * flock */
12837 onsafe = new C_SafeCond(&flock, &cond, &done, &r);
12838 done = false;
12839 } else {
12840 /* if write is unstable, we just place a barrier for
12841 * future commits to wait on */
12842 /*onsafe = new C_Block_Sync(this, vino.ino,
12843 barrier_interval(offset, offset + length), &r);
12844 */
12845 done = true;
12846 }
12847 object_t oid = file_object_t(vino.ino, blockid);
12848 SnapContext fakesnap;
12849 bufferptr bp;
12850 if (length > 0) bp = buffer::copy(buf, length);
12851 bufferlist bl;
12852 bl.push_back(bp);
12853
12854 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
12855 << dendl;
12856
12857 fakesnap.seq = snapseq;
12858
12859 /* lock just in time */
12860 client_lock.Lock();
181888fb
FG
12861 if (unmounting) {
12862 client_lock.Unlock();
12863 delete onsafe;
12864 return -ENOTCONN;
12865 }
7c673cae
FG
12866
12867 objecter->write(oid,
12868 object_locator_t(layout->pool_id),
12869 offset,
12870 length,
12871 fakesnap,
12872 bl,
12873 ceph::real_clock::now(),
12874 0,
12875 onsafe);
12876
12877 client_lock.Unlock();
12878 if (!done /* also !sync */) {
12879 flock.Lock();
12880 while (! done)
12881 cond.Wait(flock);
12882 flock.Unlock();
12883 }
12884
12885 if (r < 0) {
12886 return r;
12887 } else {
12888 return length;
12889 }
12890}
12891
12892int Client::ll_commit_blocks(Inode *in,
12893 uint64_t offset,
12894 uint64_t length)
12895{
12896 Mutex::Locker lock(client_lock);
12897 /*
12898 BarrierContext *bctx;
b32b8144 12899 vinodeno_t vino = _get_vino(in);
7c673cae
FG
12900 uint64_t ino = vino.ino;
12901
12902 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
12903 << offset << " to " << length << dendl;
12904
12905 if (length == 0) {
12906 return -EINVAL;
12907 }
12908
12909 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
12910 if (p != barriers.end()) {
12911 barrier_interval civ(offset, offset + length);
12912 p->second->commit_barrier(civ);
12913 }
12914 */
12915 return 0;
12916}
12917
12918int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12919{
12920 Mutex::Locker lock(client_lock);
12921 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12922 "~" << len << dendl;
12923 tout(cct) << "ll_write" << std::endl;
12924 tout(cct) << (unsigned long)fh << std::endl;
12925 tout(cct) << off << std::endl;
12926 tout(cct) << len << std::endl;
12927
181888fb
FG
12928 if (unmounting)
12929 return -ENOTCONN;
12930
7c673cae
FG
12931 int r = _write(fh, off, len, data, NULL, 0);
12932 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
12933 << dendl;
12934 return r;
12935}
12936
12937int Client::ll_flush(Fh *fh)
12938{
12939 Mutex::Locker lock(client_lock);
12940 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
12941 tout(cct) << "ll_flush" << std::endl;
12942 tout(cct) << (unsigned long)fh << std::endl;
12943
181888fb
FG
12944 if (unmounting)
12945 return -ENOTCONN;
12946
7c673cae
FG
12947 return _flush(fh);
12948}
12949
12950int Client::ll_fsync(Fh *fh, bool syncdataonly)
12951{
12952 Mutex::Locker lock(client_lock);
12953 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
12954 tout(cct) << "ll_fsync" << std::endl;
12955 tout(cct) << (unsigned long)fh << std::endl;
12956
181888fb
FG
12957 if (unmounting)
12958 return -ENOTCONN;
12959
7c673cae
FG
12960 int r = _fsync(fh, syncdataonly);
12961 if (r) {
12962 // If we're returning an error, clear it from the FH
12963 fh->take_async_err();
12964 }
12965 return r;
12966}
12967
28e407b8
AA
12968int Client::ll_sync_inode(Inode *in, bool syncdataonly)
12969{
12970 Mutex::Locker lock(client_lock);
12971 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
12972 tout(cct) << "ll_sync_inode" << std::endl;
12973 tout(cct) << (unsigned long)in << std::endl;
12974
12975 if (unmounting)
12976 return -ENOTCONN;
12977
12978 return _fsync(in, syncdataonly);
12979}
12980
7c673cae
FG
12981#ifdef FALLOC_FL_PUNCH_HOLE
12982
12983int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
12984{
12985 if (offset < 0 || length <= 0)
12986 return -EINVAL;
12987
12988 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
12989 return -EOPNOTSUPP;
12990
12991 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
12992 return -EOPNOTSUPP;
12993
12994 Inode *in = fh->inode.get();
12995
12996 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
12997 !(mode & FALLOC_FL_PUNCH_HOLE)) {
12998 return -ENOSPC;
12999 }
13000
13001 if (in->snapid != CEPH_NOSNAP)
13002 return -EROFS;
13003
13004 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
13005 return -EBADF;
13006
13007 uint64_t size = offset + length;
28e407b8 13008 std::list<InodeRef> quota_roots;
7c673cae
FG
13009 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
13010 size > in->size &&
28e407b8 13011 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms, &quota_roots)) {
7c673cae
FG
13012 return -EDQUOT;
13013 }
13014
13015 int have;
13016 int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
13017 if (r < 0)
13018 return r;
13019
13020 Mutex uninline_flock("Client::_fallocate_uninline_data flock");
13021 Cond uninline_cond;
13022 bool uninline_done = false;
13023 int uninline_ret = 0;
13024 Context *onuninline = NULL;
13025
13026 if (mode & FALLOC_FL_PUNCH_HOLE) {
13027 if (in->inline_version < CEPH_INLINE_NONE &&
13028 (have & CEPH_CAP_FILE_BUFFER)) {
13029 bufferlist bl;
13030 int len = in->inline_data.length();
13031 if (offset < len) {
13032 if (offset > 0)
13033 in->inline_data.copy(0, offset, bl);
13034 int size = length;
13035 if (offset + size > len)
13036 size = len - offset;
13037 if (size > 0)
13038 bl.append_zero(size);
13039 if (offset + size < len)
13040 in->inline_data.copy(offset + size, len - offset - size, bl);
13041 in->inline_data = bl;
13042 in->inline_version++;
13043 }
13044 in->mtime = ceph_clock_now();
13045 in->change_attr++;
28e407b8 13046 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13047 } else {
13048 if (in->inline_version < CEPH_INLINE_NONE) {
13049 onuninline = new C_SafeCond(&uninline_flock,
13050 &uninline_cond,
13051 &uninline_done,
13052 &uninline_ret);
13053 uninline_data(in, onuninline);
13054 }
13055
13056 Mutex flock("Client::_punch_hole flock");
13057 Cond cond;
13058 bool done = false;
13059 Context *onfinish = new C_SafeCond(&flock, &cond, &done);
13060
13061 unsafe_sync_write++;
13062 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
13063
13064 _invalidate_inode_cache(in, offset, length);
13065 filer->zero(in->ino, &in->layout,
13066 in->snaprealm->get_snap_context(),
13067 offset, length,
13068 ceph::real_clock::now(),
13069 0, true, onfinish);
13070 in->mtime = ceph_clock_now();
13071 in->change_attr++;
28e407b8 13072 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13073
13074 client_lock.Unlock();
13075 flock.Lock();
13076 while (!done)
13077 cond.Wait(flock);
13078 flock.Unlock();
13079 client_lock.Lock();
13080 _sync_write_commit(in);
13081 }
13082 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
13083 uint64_t size = offset + length;
13084 if (size > in->size) {
13085 in->size = size;
13086 in->mtime = ceph_clock_now();
13087 in->change_attr++;
28e407b8 13088 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 13089
28e407b8 13090 if (is_quota_bytes_approaching(in, quota_roots)) {
7c673cae 13091 check_caps(in, CHECK_CAPS_NODELAY);
31f18b77
FG
13092 } else if (is_max_size_approaching(in)) {
13093 check_caps(in, 0);
7c673cae
FG
13094 }
13095 }
13096 }
13097
13098 if (onuninline) {
13099 client_lock.Unlock();
13100 uninline_flock.Lock();
13101 while (!uninline_done)
13102 uninline_cond.Wait(uninline_flock);
13103 uninline_flock.Unlock();
13104 client_lock.Lock();
13105
13106 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
13107 in->inline_data.clear();
13108 in->inline_version = CEPH_INLINE_NONE;
28e407b8 13109 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
13110 check_caps(in, 0);
13111 } else
13112 r = uninline_ret;
13113 }
13114
13115 put_cap_ref(in, CEPH_CAP_FILE_WR);
13116 return r;
13117}
13118#else
13119
13120int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13121{
13122 return -EOPNOTSUPP;
13123}
13124
13125#endif
13126
13127
13128int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
13129{
13130 Mutex::Locker lock(client_lock);
13131 ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
13132 tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
13133 tout(cct) << (unsigned long)fh << std::endl;
13134
181888fb
FG
13135 if (unmounting)
13136 return -ENOTCONN;
13137
7c673cae
FG
13138 return _fallocate(fh, mode, offset, length);
13139}
13140
13141int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13142{
13143 Mutex::Locker lock(client_lock);
13144 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
13145
181888fb
FG
13146 if (unmounting)
13147 return -ENOTCONN;
13148
7c673cae
FG
13149 Fh *fh = get_filehandle(fd);
13150 if (!fh)
13151 return -EBADF;
13152#if defined(__linux__) && defined(O_PATH)
13153 if (fh->flags & O_PATH)
13154 return -EBADF;
13155#endif
13156 return _fallocate(fh, mode, offset, length);
13157}
13158
13159int Client::ll_release(Fh *fh)
13160{
13161 Mutex::Locker lock(client_lock);
13162 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
13163 dendl;
13164 tout(cct) << "ll_release (fh)" << std::endl;
13165 tout(cct) << (unsigned long)fh << std::endl;
13166
181888fb
FG
13167 if (unmounting)
13168 return -ENOTCONN;
13169
7c673cae
FG
13170 if (ll_unclosed_fh_set.count(fh))
13171 ll_unclosed_fh_set.erase(fh);
13172 return _release_fh(fh);
13173}
13174
13175int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13176{
13177 Mutex::Locker lock(client_lock);
13178
13179 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13180 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13181
181888fb
FG
13182 if (unmounting)
13183 return -ENOTCONN;
13184
7c673cae
FG
13185 return _getlk(fh, fl, owner);
13186}
13187
13188int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13189{
13190 Mutex::Locker lock(client_lock);
13191
13192 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
13193 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13194
181888fb
FG
13195 if (unmounting)
13196 return -ENOTCONN;
13197
7c673cae
FG
13198 return _setlk(fh, fl, owner, sleep);
13199}
13200
13201int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13202{
13203 Mutex::Locker lock(client_lock);
13204
13205 ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
13206 tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
13207
181888fb
FG
13208 if (unmounting)
13209 return -ENOTCONN;
13210
7c673cae
FG
13211 return _flock(fh, cmd, owner);
13212}
13213
b32b8144
FG
13214int Client::set_deleg_timeout(uint32_t timeout)
13215{
13216 Mutex::Locker lock(client_lock);
13217
13218 /*
13219 * The whole point is to prevent blacklisting so we must time out the
13220 * delegation before the session autoclose timeout kicks in.
13221 */
13222 if (timeout >= mdsmap->get_session_autoclose())
13223 return -EINVAL;
13224
13225 deleg_timeout = timeout;
13226 return 0;
13227}
13228
13229int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13230{
13231 int ret = -EINVAL;
13232
13233 Mutex::Locker lock(client_lock);
13234
13235 if (!mounted)
13236 return -ENOTCONN;
13237
13238 Inode *inode = fh->inode.get();
13239
13240 switch(cmd) {
13241 case CEPH_DELEGATION_NONE:
13242 inode->unset_deleg(fh);
13243 ret = 0;
13244 break;
13245 default:
13246 try {
13247 ret = inode->set_deleg(fh, cmd, cb, priv);
13248 } catch (std::bad_alloc) {
13249 ret = -ENOMEM;
13250 }
13251 break;
13252 }
13253 return ret;
13254}
13255
7c673cae
FG
13256class C_Client_RequestInterrupt : public Context {
13257private:
13258 Client *client;
13259 MetaRequest *req;
13260public:
13261 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13262 req->get();
13263 }
13264 void finish(int r) override {
13265 Mutex::Locker l(client->client_lock);
13266 assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
13267 client->_interrupt_filelock(req);
13268 client->put_request(req);
13269 }
13270};
13271
13272void Client::ll_interrupt(void *d)
13273{
13274 MetaRequest *req = static_cast<MetaRequest*>(d);
13275 ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
13276 tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
13277 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13278}
13279
13280// =========================================
13281// layout
13282
13283// expose file layouts
13284
13285int Client::describe_layout(const char *relpath, file_layout_t *lp,
13286 const UserPerm& perms)
13287{
13288 Mutex::Locker lock(client_lock);
13289
181888fb
FG
13290 if (unmounting)
13291 return -ENOTCONN;
13292
7c673cae
FG
13293 filepath path(relpath);
13294 InodeRef in;
13295 int r = path_walk(path, &in, perms);
13296 if (r < 0)
13297 return r;
13298
13299 *lp = in->layout;
13300
13301 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
13302 return 0;
13303}
13304
13305int Client::fdescribe_layout(int fd, file_layout_t *lp)
13306{
13307 Mutex::Locker lock(client_lock);
13308
181888fb
FG
13309 if (unmounting)
13310 return -ENOTCONN;
13311
7c673cae
FG
13312 Fh *f = get_filehandle(fd);
13313 if (!f)
13314 return -EBADF;
13315 Inode *in = f->inode.get();
13316
13317 *lp = in->layout;
13318
13319 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
13320 return 0;
13321}
13322
d2e6a577
FG
13323int64_t Client::get_default_pool_id()
13324{
13325 Mutex::Locker lock(client_lock);
181888fb
FG
13326
13327 if (unmounting)
13328 return -ENOTCONN;
13329
d2e6a577
FG
13330 /* first data pool is the default */
13331 return mdsmap->get_first_data_pool();
13332}
7c673cae
FG
13333
13334// expose osdmap
13335
13336int64_t Client::get_pool_id(const char *pool_name)
13337{
13338 Mutex::Locker lock(client_lock);
181888fb
FG
13339
13340 if (unmounting)
13341 return -ENOTCONN;
13342
7c673cae
FG
13343 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13344 pool_name);
13345}
13346
13347string Client::get_pool_name(int64_t pool)
13348{
13349 Mutex::Locker lock(client_lock);
181888fb
FG
13350
13351 if (unmounting)
13352 return string();
13353
7c673cae
FG
13354 return objecter->with_osdmap([pool](const OSDMap& o) {
13355 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13356 });
13357}
13358
13359int Client::get_pool_replication(int64_t pool)
13360{
13361 Mutex::Locker lock(client_lock);
181888fb
FG
13362
13363 if (unmounting)
13364 return -ENOTCONN;
13365
7c673cae
FG
13366 return objecter->with_osdmap([pool](const OSDMap& o) {
13367 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13368 });
13369}
13370
13371int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13372{
13373 Mutex::Locker lock(client_lock);
13374
181888fb
FG
13375 if (unmounting)
13376 return -ENOTCONN;
13377
7c673cae
FG
13378 Fh *f = get_filehandle(fd);
13379 if (!f)
13380 return -EBADF;
13381 Inode *in = f->inode.get();
13382
13383 vector<ObjectExtent> extents;
13384 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
13385 assert(extents.size() == 1);
13386
13387 objecter->with_osdmap([&](const OSDMap& o) {
13388 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13389 o.pg_to_acting_osds(pg, osds);
13390 });
13391
13392 if (osds.empty())
13393 return -EINVAL;
13394
13395 /*
13396 * Return the remainder of the extent (stripe unit)
13397 *
13398 * If length = 1 is passed to Striper::file_to_extents we get a single
13399 * extent back, but its length is one so we still need to compute the length
13400 * to the end of the stripe unit.
13401 *
13402 * If length = su then we may get 1 or 2 objects back in the extents vector
13403 * which would have to be examined. Even then, the offsets are local to the
13404 * object, so matching up to the file offset is extra work.
13405 *
13406 * It seems simpler to stick with length = 1 and manually compute the
13407 * remainder.
13408 */
13409 if (len) {
13410 uint64_t su = in->layout.stripe_unit;
13411 *len = su - (off % su);
13412 }
13413
13414 return 0;
13415}
13416
13417int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13418{
13419 Mutex::Locker lock(client_lock);
181888fb
FG
13420
13421 if (unmounting)
13422 return -ENOTCONN;
13423
7c673cae
FG
13424 if (id < 0)
13425 return -EINVAL;
13426 return objecter->with_osdmap([&](const OSDMap& o) {
13427 return o.crush->get_full_location_ordered(id, path);
13428 });
13429}
13430
13431int Client::get_file_stripe_address(int fd, loff_t offset,
13432 vector<entity_addr_t>& address)
13433{
13434 Mutex::Locker lock(client_lock);
13435
181888fb
FG
13436 if (unmounting)
13437 return -ENOTCONN;
13438
7c673cae
FG
13439 Fh *f = get_filehandle(fd);
13440 if (!f)
13441 return -EBADF;
13442 Inode *in = f->inode.get();
13443
13444 // which object?
13445 vector<ObjectExtent> extents;
13446 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13447 in->truncate_size, extents);
13448 assert(extents.size() == 1);
13449
13450 // now we have the object and its 'layout'
13451 return objecter->with_osdmap([&](const OSDMap& o) {
13452 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13453 vector<int> osds;
13454 o.pg_to_acting_osds(pg, osds);
13455 if (osds.empty())
13456 return -EINVAL;
13457 for (unsigned i = 0; i < osds.size(); i++) {
13458 entity_addr_t addr = o.get_addr(osds[i]);
13459 address.push_back(addr);
13460 }
13461 return 0;
13462 });
13463}
13464
13465int Client::get_osd_addr(int osd, entity_addr_t& addr)
13466{
13467 Mutex::Locker lock(client_lock);
181888fb
FG
13468
13469 if (unmounting)
13470 return -ENOTCONN;
13471
7c673cae
FG
13472 return objecter->with_osdmap([&](const OSDMap& o) {
13473 if (!o.exists(osd))
13474 return -ENOENT;
13475
13476 addr = o.get_addr(osd);
13477 return 0;
13478 });
13479}
13480
13481int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13482 loff_t length, loff_t offset)
13483{
13484 Mutex::Locker lock(client_lock);
13485
181888fb
FG
13486 if (unmounting)
13487 return -ENOTCONN;
13488
7c673cae
FG
13489 Fh *f = get_filehandle(fd);
13490 if (!f)
13491 return -EBADF;
13492 Inode *in = f->inode.get();
13493
13494 // map to a list of extents
13495 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13496
13497 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13498 return 0;
13499}
13500
13501
b32b8144 13502/* find an osd with the same ip. -ENXIO if none. */
7c673cae
FG
13503int Client::get_local_osd()
13504{
13505 Mutex::Locker lock(client_lock);
181888fb
FG
13506
13507 if (unmounting)
13508 return -ENOTCONN;
13509
7c673cae
FG
13510 objecter->with_osdmap([this](const OSDMap& o) {
13511 if (o.get_epoch() != local_osd_epoch) {
13512 local_osd = o.find_osd_on_ip(messenger->get_myaddr());
13513 local_osd_epoch = o.get_epoch();
13514 }
13515 });
13516 return local_osd;
13517}
13518
13519
13520
13521
13522
13523
13524// ===============================
13525
13526void Client::ms_handle_connect(Connection *con)
13527{
13528 ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
13529}
13530
13531bool Client::ms_handle_reset(Connection *con)
13532{
13533 ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
13534 return false;
13535}
13536
13537void Client::ms_handle_remote_reset(Connection *con)
13538{
13539 ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
13540 Mutex::Locker l(client_lock);
13541 switch (con->get_peer_type()) {
13542 case CEPH_ENTITY_TYPE_MDS:
13543 {
13544 // kludge to figure out which mds this is; fixme with a Connection* state
13545 mds_rank_t mds = MDS_RANK_NONE;
13546 MetaSession *s = NULL;
13547 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
13548 p != mds_sessions.end();
13549 ++p) {
13550 if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
13551 mds = p->first;
13552 s = p->second;
13553 }
13554 }
13555 if (mds >= 0) {
d2e6a577 13556 assert (s != NULL);
7c673cae
FG
13557 switch (s->state) {
13558 case MetaSession::STATE_CLOSING:
13559 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
13560 _closed_mds_session(s);
13561 break;
13562
13563 case MetaSession::STATE_OPENING:
13564 {
13565 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
13566 list<Context*> waiters;
13567 waiters.swap(s->waiting_for_open);
13568 _closed_mds_session(s);
13569 MetaSession *news = _get_or_open_mds_session(mds);
13570 news->waiting_for_open.swap(waiters);
13571 }
13572 break;
13573
13574 case MetaSession::STATE_OPEN:
13575 {
28e407b8 13576 objecter->maybe_request_map(); /* to check if we are blacklisted */
7c673cae
FG
13577 const md_config_t *conf = cct->_conf;
13578 if (conf->client_reconnect_stale) {
13579 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
13580 _closed_mds_session(s);
13581 } else {
13582 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
13583 s->state = MetaSession::STATE_STALE;
13584 }
13585 }
13586 break;
13587
13588 case MetaSession::STATE_NEW:
13589 case MetaSession::STATE_CLOSED:
13590 default:
13591 break;
13592 }
13593 }
13594 }
13595 break;
13596 }
13597}
13598
13599bool Client::ms_handle_refused(Connection *con)
13600{
13601 ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
13602 return false;
13603}
13604
13605bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
13606{
13607 if (dest_type == CEPH_ENTITY_TYPE_MON)
13608 return true;
13609 *authorizer = monclient->build_authorizer(dest_type);
13610 return true;
13611}
13612
13613Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
13614{
13615 Inode *cur = in;
13616 utime_t now = ceph_clock_now();
13617
13618 while (cur) {
13619 if (cur != in && cur->quota.is_enable())
13620 break;
13621
13622 Inode *parent_in = NULL;
13623 if (!cur->dn_set.empty()) {
13624 for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
13625 Dentry *dn = *p;
13626 if (dn->lease_mds >= 0 &&
13627 dn->lease_ttl > now &&
13628 mds_sessions.count(dn->lease_mds)) {
13629 parent_in = dn->dir->parent_inode;
13630 } else {
13631 Inode *diri = dn->dir->parent_inode;
13632 if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
13633 diri->shared_gen == dn->cap_shared_gen) {
13634 parent_in = dn->dir->parent_inode;
13635 }
13636 }
13637 if (parent_in)
13638 break;
13639 }
13640 } else if (root_parents.count(cur)) {
13641 parent_in = root_parents[cur].get();
13642 }
13643
13644 if (parent_in) {
13645 cur = parent_in;
13646 continue;
13647 }
13648
13649 if (cur == root_ancestor)
13650 break;
13651
181888fb
FG
13652 // deleted inode
13653 if (cur->nlink == 0) {
13654 cur = root_ancestor;
13655 break;
13656 }
13657
7c673cae
FG
13658 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
13659 filepath path(cur->ino);
13660 req->set_filepath(path);
13661 req->set_inode(cur);
13662
13663 InodeRef parent_ref;
13664 int ret = make_request(req, perms, &parent_ref);
13665 if (ret < 0) {
13666 ldout(cct, 1) << __func__ << " " << in->vino()
13667 << " failed to find parent of " << cur->vino()
13668 << " err " << ret << dendl;
13669 // FIXME: what to do?
13670 cur = root_ancestor;
13671 break;
13672 }
13673
13674 now = ceph_clock_now();
13675 if (cur == in)
13676 cur = parent_ref.get();
13677 else
13678 cur = in; // start over
13679 }
13680
13681 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
13682 return cur;
13683}
13684
13685/**
13686 * Traverse quota ancestors of the Inode, return true
13687 * if any of them passes the passed function
13688 */
13689bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13690 std::function<bool (const Inode &in)> test)
13691{
13692 while (true) {
13693 assert(in != NULL);
13694 if (test(*in)) {
13695 return true;
13696 }
13697
13698 if (in == root_ancestor) {
13699 // We're done traversing, drop out
13700 return false;
13701 } else {
13702 // Continue up the tree
13703 in = get_quota_root(in, perms);
13704 }
13705 }
13706
13707 return false;
13708}
13709
13710bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13711{
13712 return check_quota_condition(in, perms,
13713 [](const Inode &in) {
13714 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13715 });
13716}
13717
13718bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
28e407b8
AA
13719 const UserPerm& perms,
13720 std::list<InodeRef>* quota_roots)
7c673cae
FG
13721{
13722 return check_quota_condition(in, perms,
28e407b8
AA
13723 [&new_bytes, quota_roots](const Inode &in) {
13724 if (quota_roots)
13725 quota_roots->emplace_back(const_cast<Inode*>(&in));
7c673cae
FG
13726 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13727 > in.quota.max_bytes;
13728 });
13729}
13730
28e407b8 13731bool Client::is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots)
7c673cae 13732{
28e407b8
AA
13733 assert(in->size >= in->reported_size);
13734 const uint64_t size = in->size - in->reported_size;
13735
13736 for (auto& diri : quota_roots) {
13737 if (diri->quota.max_bytes) {
13738 if (diri->rstat.rbytes >= diri->quota.max_bytes)
13739 return true;
13740
13741 uint64_t space = diri->quota.max_bytes - diri->rstat.rbytes;
13742 if ((space >> 4) < size)
13743 return true;
13744 }
13745 }
13746 return false;
7c673cae
FG
13747}
13748
// pool-permission cache states; OR'd flag values stored in pool_perms
enum {
  POOL_CHECKED = 1,   // probe completed
  POOL_CHECKING = 2,  // probe in flight; other callers wait
  POOL_READ = 4,      // read access confirmed
  POOL_WRITE = 8,     // write access confirmed
};
13755
/**
 * Verify that this client may perform the requested IO on the pool that
 * backs inode 'in'.
 *
 * @param in    inode whose layout selects the (pool, namespace) to check
 * @param need  CEPH_CAP_FILE_RD and/or CEPH_CAP_FILE_WR bits being requested
 * @return 0 if allowed, -EPERM if the pool denies the access, -EIO if the
 *         probe failed with an unexpected error.
 *
 * Results are cached per (pool id, pool namespace) in pool_perms; only the
 * first caller actually probes the OSDs.  Must be called with client_lock
 * held; the lock is dropped while waiting for the probe ops to complete.
 */
int Client::check_pool_perm(Inode *in, int need)
{
  // permission probing can be disabled entirely by configuration
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache.  If another thread is mid-probe (POOL_CHECKING),
  // sleep until it signals and then re-check the cache.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // mark the probe in flight so concurrent callers wait instead of racing
    pool_perms[perm_key] = POOL_CHECKING;

    // probe against the first object of the file (ino.00000000)
    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // read probe: a stat op — -ENOENT still proves read access
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // write probe: exclusive create — -EEXIST still proves write access
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // drop client_lock while blocking on the OSD round trips
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      // anything other than a clean denial is an indeterminate failure
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // publish the result and wake any waiters blocked on POOL_CHECKING
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  // finally compare what the caller needs against what the pool grants
  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13858
13859int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13860{
13861 if (acl_type == POSIX_ACL) {
13862 if (in->xattrs.count(ACL_EA_ACCESS)) {
13863 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13864
13865 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13866 }
13867 }
13868 return -EAGAIN;
13869}
13870
13871int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
13872{
13873 if (acl_type == NO_ACL)
13874 return 0;
13875
13876 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
13877 if (r < 0)
13878 goto out;
13879
13880 if (acl_type == POSIX_ACL) {
13881 if (in->xattrs.count(ACL_EA_ACCESS)) {
13882 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13883 bufferptr acl(access_acl.c_str(), access_acl.length());
13884 r = posix_acl_access_chmod(acl, mode);
13885 if (r < 0)
13886 goto out;
13887 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
13888 } else {
13889 r = 0;
13890 }
13891 }
13892out:
13893 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
13894 return r;
13895}
13896
/**
 * Compute the ACL xattrs a new inode created under 'dir' should start
 * with, following POSIX default-ACL inheritance.
 *
 * @param dir        parent directory supplying the default ACL
 * @param mode       in/out: requested mode; may be adjusted by ACL
 *                   inheritance or by the umask callback
 * @param xattrs_bl  out: encoded map of xattrs to set on the new inode
 * @param perms      credentials used to fetch the parent's xattrs
 * @return number of xattrs encoded (>= 0) on success, negative errno on
 *         failure.
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // symlinks do not carry ACLs
  if (S_ISLNK(*mode))
    return 0;

  // make sure the parent's xattrs (and thus its default ACL) are fresh
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // copy the default ACL so inheritance can modify it in place
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// r > 0: the inherited ACL is non-trivial; if it is not fully
	// representable by mode bits, store it as the access ACL
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // directories propagate the parent's default ACL to their children
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // no default ACL: apply the process umask via callback, if provided
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
13944
13945void Client::set_filer_flags(int flags)
13946{
13947 Mutex::Locker l(client_lock);
13948 assert(flags == 0 ||
13949 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13950 objecter->add_global_op_flags(flags);
13951}
13952
13953void Client::clear_filer_flags(int flags)
13954{
13955 Mutex::Locker l(client_lock);
13956 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13957 objecter->clear_global_op_flag(flags);
13958}
13959
13960/**
13961 * This is included in cap release messages, to cause
13962 * the MDS to wait until this OSD map epoch. It is necessary
13963 * in corner cases where we cancel RADOS ops, so that
13964 * nobody else tries to do IO to the same objects in
13965 * the same epoch as the cancelled ops.
13966 */
13967void Client::set_cap_epoch_barrier(epoch_t e)
13968{
13969 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
13970 cap_epoch_barrier = e;
13971}
13972
13973const char** Client::get_tracked_conf_keys() const
13974{
13975 static const char* keys[] = {
13976 "client_cache_size",
13977 "client_cache_mid",
13978 "client_acl_type",
b32b8144
FG
13979 "client_deleg_timeout",
13980 "client_deleg_break_on_open",
7c673cae
FG
13981 NULL
13982 };
13983 return keys;
13984}
13985
13986void Client::handle_conf_change(const struct md_config_t *conf,
13987 const std::set <std::string> &changed)
13988{
13989 Mutex::Locker lock(client_lock);
13990
181888fb 13991 if (changed.count("client_cache_mid")) {
7c673cae
FG
13992 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
13993 }
13994 if (changed.count("client_acl_type")) {
13995 acl_type = NO_ACL;
13996 if (cct->_conf->client_acl_type == "posix_acl")
13997 acl_type = POSIX_ACL;
13998 }
13999}
14000
7c673cae
FG
// intrusive_ptr refcount hook: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14005
// intrusive_ptr refcount hook: drop a reference via the owning client,
// which may free the inode.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14010
14011mds_rank_t Client::_get_random_up_mds() const
14012{
14013 assert(client_lock.is_locked_by_me());
14014
14015 std::set<mds_rank_t> up;
14016 mdsmap->get_up_mds_set(up);
14017
14018 if (up.empty())
14019 return MDS_RANK_NONE;
14020 std::set<mds_rank_t>::const_iterator p = up.begin();
14021 for (int n = rand() % up.size(); n; n--)
14022 ++p;
14023 return *p;
14024}
14025
14026
// Standalone (non-embedded) client: constructs and owns its own Objecter,
// then wires the mon client and objecter to the supplied messenger.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14033
// Destroy the objecter we created in the constructor (the base Client
// does not own it).
StandaloneClient::~StandaloneClient()
{
  delete objecter;
  objecter = nullptr;
}
14039
// Bring up the standalone client: timer, object cacher, objecter,
// messenger dispatchers and the mon client.  Returns 0 on success or a
// negative error from MonClient::init(), after unwinding the partially
// initialized state.
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    // NOTE(review): client_lock is released before tearing down the
    // objecter/objectcacher/monclient — presumably to avoid blocking
    // their shutdown paths while holding the lock; confirm before
    // reordering.
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  // complete the remaining (unlocked) initialization in the base class
  _finish_init();

  return 0;
}
14070
// Tear down in reverse dependency order: base Client first, then the
// objecter we own, then the mon client.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}