]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
update sources to 12.2.7
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
21#include <sys/stat.h>
22#include <sys/param.h>
23#include <fcntl.h>
24#include <sys/file.h>
25#include <sys/utsname.h>
26#include <sys/uio.h>
27
28#include <boost/lexical_cast.hpp>
29#include <boost/fusion/include/std_pair.hpp>
30
31#if defined(__FreeBSD__)
32#define XATTR_CREATE 0x1
33#define XATTR_REPLACE 0x2
34#else
35#include <sys/xattr.h>
36#endif
37
38#if defined(__linux__)
39#include <linux/falloc.h>
40#endif
41
42#include <sys/statvfs.h>
43
44#include "common/config.h"
45#include "common/version.h"
46
47// ceph stuff
48#include "messages/MClientSession.h"
49#include "messages/MClientReconnect.h"
50#include "messages/MClientRequest.h"
51#include "messages/MClientRequestForward.h"
52#include "messages/MClientReply.h"
53#include "messages/MClientCaps.h"
54#include "messages/MClientLease.h"
55#include "messages/MClientSnap.h"
56#include "messages/MCommandReply.h"
57#include "messages/MOSDMap.h"
58#include "messages/MClientQuota.h"
59#include "messages/MClientCapRelease.h"
60#include "messages/MMDSMap.h"
61#include "messages/MFSMap.h"
62#include "messages/MFSMapUser.h"
63
64#include "mon/MonClient.h"
65
66#include "mds/flock.h"
67#include "osd/OSDMap.h"
68#include "osdc/Filer.h"
69
70#include "common/Cond.h"
71#include "common/Mutex.h"
72#include "common/perf_counters.h"
73#include "common/admin_socket.h"
74#include "common/errno.h"
75#include "include/str_list.h"
76
77#define dout_subsys ceph_subsys_client
78
79#include "include/lru.h"
80#include "include/compat.h"
81#include "include/stringify.h"
82
83#include "Client.h"
84#include "Inode.h"
85#include "Dentry.h"
b32b8144 86#include "Delegation.h"
7c673cae
FG
87#include "Dir.h"
88#include "ClientSnapRealm.h"
89#include "Fh.h"
90#include "MetaSession.h"
91#include "MetaRequest.h"
92#include "ObjecterWriteback.h"
93#include "posix_acl.h"
94
95#include "include/assert.h"
96#include "include/stat.h"
97
98#include "include/cephfs/ceph_statx.h"
99
100#if HAVE_GETGROUPLIST
101#include <grp.h>
102#include <pwd.h>
103#include <unistd.h>
104#endif
105
106#undef dout_prefix
107#define dout_prefix *_dout << "client." << whoami << " "
108
109#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
110
111// FreeBSD fails to define this
112#ifndef O_DSYNC
113#define O_DSYNC 0x0
114#endif
115// Darwin fails to define this
116#ifndef O_RSYNC
117#define O_RSYNC 0x0
118#endif
119
120#ifndef O_DIRECT
121#define O_DIRECT 0x0
122#endif
123
124#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
125
126void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
127{
128 Client *client = static_cast<Client*>(p);
129 client->flush_set_callback(oset);
130}
131
132
133// -------------
134
// Admin-socket command hook: remembers the owning Client so that
// call() can route commands back to it.
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}
139
140bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
141 std::string format, bufferlist& out)
142{
143 Formatter *f = Formatter::create(format);
144 f->open_object_section("result");
145 m_client->client_lock.Lock();
146 if (command == "mds_requests")
147 m_client->dump_mds_requests(f);
148 else if (command == "mds_sessions")
149 m_client->dump_mds_sessions(f);
150 else if (command == "dump_cache")
151 m_client->dump_cache(f);
152 else if (command == "kick_stale_sessions")
153 m_client->_kick_stale_sessions();
154 else if (command == "status")
155 m_client->dump_status(f);
156 else
157 assert(0 == "bad command registered");
158 m_client->client_lock.Unlock();
159 f->close_section();
160 f->flush(out);
161 delete f;
162 return true;
163}
164
165
166// -------------
167
// Start a fresh directory read: offset 0, next_offset 2 (offsets 0 and
// 1 are reserved for "." and ".."), and no cached-readdir state yet.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
173
174void Client::_reset_faked_inos()
175{
176 ino_t start = 1024;
177 free_faked_inos.clear();
178 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
179 last_used_faked_ino = 0;
180 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
181}
182
// Hand out the next free fake ino to 'in', scanning forward from the
// last one used and wrapping around to the bottom of the free set when
// the high end is exhausted.
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // Nothing free above us: wrap around and search from the start.
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // Cursor fell before the free interval: jump to its first ino.
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // Cursor is inside the interval: take the next ino in sequence.
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // Mark it allocated and remember the mapping back to the real vino.
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
202
203void Client::_release_faked_ino(Inode *in)
204{
205 free_faked_inos.insert(in->faked_ino);
206 faked_ino_map.erase(in->faked_ino);
207}
208
// Translate a fake ino back to the real (ino, snapid) pair.  Fake ino 1
// always resolves to the root; unknown inos map to (0, CEPH_NOSNAP).
// NOTE(review): dereferences root unconditionally when ino == 1 —
// presumably only called on a mounted client; confirm callers.
vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
  return vino;
}
221
222vinodeno_t Client::map_faked_ino(ino_t ino)
223{
224 Mutex::Locker lock(client_lock);
225 return _map_faked_ino(ino);
226}
227
228// cons/des
229
/*
 * Construct a Client bound to an existing Messenger, MonClient and
 * Objecter.  Only in-process setup happens here; messenger dispatch,
 * perf counters and admin-socket commands are wired up later in
 * init()/_finish_init().
 */
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-ENXIO), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock"),
    deleg_timeout(0)
{
  _reset_faked_inos();
  // root inode is filled in lazily, when the first MDS reply arrives
  // (see add_update_inode)
  root = 0;

  num_flushing_caps = 0;

  // precompute name-size totals for the virtual xattr tables
  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are reserved
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback,    // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
297
298
// Destructor: must run with client_lock NOT held by the caller
// (asserted), since we take it ourselves for cache teardown.
Client::~Client()
{
  assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
310
/*
 * Drop every cached object ahead of destruction: force-close open file
 * handles and directories, empty the dentry LRU, then release the root
 * inode itself.  Caller holds client_lock (see ~Client).
 */
void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  // open directories; _closedir removes the dirp from opened_dirs
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  assert(lru.lru_get_size() == 0);

  // close root ino; at this point only the root (and any root_parents
  // entries) should remain in inode_map
  assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  assert(inode_map.empty());
}
350
// Return the root inode number — the fake ino when faked inos are in
// use, the real ino otherwise.
// NOTE(review): dereferences root unconditionally — presumably only
// valid once mounted; confirm callers.
inodeno_t Client::get_root_ino()
{
  Mutex::Locker l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}
359
// Return the root inode with an extra ll_ reference taken on behalf of
// the caller (via ll_get); the caller is responsible for dropping it.
// NOTE(review): assumes root is non-null — confirm callers only invoke
// this on a mounted client.
Inode *Client::get_root()
{
  Mutex::Locker l(client_lock);
  root->ll_get();
  return root;
}
366
367
368// debug crapola
369
/*
 * Debug helper: log one inode (and, when f is non-NULL, emit it to the
 * formatter), then recurse through its open Dir's dentries.  Every
 * visited inode is added to 'did' so dump_cache can later identify
 * inodes not reachable from the root ("disconnected").
 */
void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
		<< (disconnected ? "DISCONNECTED ":"")
		<< "inode " << in->ino
		<< " " << path
		<< " ref " << in->get_num_ref()
		<< *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    // recurse into the open directory's dentries
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
	f->open_object_section("dentry");
	it->second->dump(f);
	f->close_section();
      }
      if (it->second->inode)
	dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}
407
408void Client::dump_cache(Formatter *f)
409{
410 set<Inode*> did;
411
412 ldout(cct, 1) << "dump_cache" << dendl;
413
414 if (f)
415 f->open_array_section("cache");
416
417 if (root)
418 dump_inode(f, root, did, true);
419
420 // make a second pass to catch anything disconnected
421 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
422 it != inode_map.end();
423 ++it) {
424 if (did.count(it->second))
425 continue;
426 dump_inode(f, it->second, did, true);
427 }
428
429 if (f)
430 f->close_section();
431}
432
/*
 * Emit overall client status — metadata key/values, cache counters and
 * map epochs — to the formatter.  Caller must hold client_lock
 * (asserted below).
 */
void Client::dump_status(Formatter *f)
{
  assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
  }
}
457
/*
 * Bring the client up: start the timer and the object cacher, register
 * this dispatcher with the messenger, then finish setup (perf counters,
 * admin-socket commands) in _finish_init().
 *
 * @return 0 (always succeeds)
 */
int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}
472
473void Client::_finish_init()
474{
475 client_lock.Lock();
476 // logger
477 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
478 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
479 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
480 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
481 logger.reset(plb.create_perf_counters());
482 cct->get_perfcounters_collection()->add(logger.get());
483
484 client_lock.Unlock();
485
486 cct->_conf->add_observer(this);
487
488 AdminSocket* admin_socket = cct->get_admin_socket();
489 int ret = admin_socket->register_command("mds_requests",
490 "mds_requests",
491 &m_command_hook,
492 "show in-progress mds requests");
493 if (ret < 0) {
494 lderr(cct) << "error registering admin socket command: "
495 << cpp_strerror(-ret) << dendl;
496 }
497 ret = admin_socket->register_command("mds_sessions",
498 "mds_sessions",
499 &m_command_hook,
500 "show mds session state");
501 if (ret < 0) {
502 lderr(cct) << "error registering admin socket command: "
503 << cpp_strerror(-ret) << dendl;
504 }
505 ret = admin_socket->register_command("dump_cache",
506 "dump_cache",
507 &m_command_hook,
508 "show in-memory metadata cache contents");
509 if (ret < 0) {
510 lderr(cct) << "error registering admin socket command: "
511 << cpp_strerror(-ret) << dendl;
512 }
513 ret = admin_socket->register_command("kick_stale_sessions",
514 "kick_stale_sessions",
515 &m_command_hook,
516 "kick sessions that were remote reset");
517 if (ret < 0) {
518 lderr(cct) << "error registering admin socket command: "
519 << cpp_strerror(-ret) << dendl;
520 }
521 ret = admin_socket->register_command("status",
522 "status",
523 &m_command_hook,
524 "show overall client status");
525 if (ret < 0) {
526 lderr(cct) << "error registering admin socket command: "
527 << cpp_strerror(-ret) << dendl;
528 }
529
530 client_lock.Lock();
531 initialized = true;
532 client_lock.Unlock();
533}
534
/*
 * Orderly shutdown: close MDS sessions, detach from config/admin
 * socket, drain and stop each finisher that was started for a
 * registered callback, stop the object cacher and timer, and finally
 * tear down the perf counters.  Order matters: the object cacher is
 * stopped outside client_lock because stopping joins its thread.
 */
void Client::shutdown()
{
  ldout(cct, 1) << "shutdown" << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf->remove_observer(this);

  // mirror of the registrations done in _finish_init()
  AdminSocket* admin_socket = cct->get_admin_socket();
  admin_socket->unregister_command("mds_requests");
  admin_socket->unregister_command("mds_sessions");
  admin_socket->unregister_command("dump_cache");
  admin_socket->unregister_command("kick_stale_sessions");
  admin_socket->unregister_command("status");

  // each finisher is only running if its callback was registered
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop(); // outside of client_lock! this does a join.

  client_lock.Lock();
  assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
594
595
596// ===================
597// metadata cache stuff
598
/*
 * Trim the dentry LRU down to client_cache_size (or all the way down
 * when unmounting), optionally asking the kernel to drop its dcache
 * too, and release the root inode once nothing references it.
 */
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // loop until a pass makes no progress (trim_dentry may unpin more
  // dentries, so re-check after each expiry)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    // while unmounting we trim unconditionally
    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
632
633void Client::trim_cache_for_reconnect(MetaSession *s)
634{
635 mds_rank_t mds = s->mds_num;
636 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
637
638 int trimmed = 0;
639 list<Dentry*> skipped;
640 while (lru.lru_get_size() > 0) {
641 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
642 if (!dn)
643 break;
644
645 if ((dn->inode && dn->inode->caps.count(mds)) ||
646 dn->dir->parent_inode->caps.count(mds)) {
647 trim_dentry(dn);
648 trimmed++;
649 } else
650 skipped.push_back(dn);
651 }
652
653 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
654 lru.lru_insert_mid(*p);
655
656 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
657 << " trimmed " << trimmed << " dentries" << dendl;
658
659 if (s->caps.size() > 0)
660 _invalidate_kernel_dcache();
661}
662
// Unlink one dentry as part of cache trimming.  If the dentry points at
// an inode, bump the parent directory's release count and clear its
// I_COMPLETE state, since its cached listing is no longer complete.
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir " << hex << dn->dir->parent_inode->ino
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
675
676
/*
 * Merge MDS-reported file metadata (size, truncation state, inline
 * data, and times) into a cached inode.  truncate_seq decides whether
 * the MDS's size wins; time_warp_seq decides whether its mtime/atime
 * win; 'issued' (the caps we hold) decides how much we trust our own
 * local values over the MDS's.  Statement order matters here — the
 * sequence comparisons depend on the pre-update local values.
 */
void Client::update_inode_file_bits(Inode *in,
				    uint64_t truncate_seq, uint64_t truncate_size,
				    uint64_t size, uint64_t change_attr,
				    uint64_t time_warp_seq, utime_t ctime,
				    utime_t mtime,
				    utime_t atime,
				    version_t inline_version,
				    bufferlist& inline_data,
				    int issued)
{
  bool warn = false;
  ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
		 << " mtime " << mtime << dendl;
  ldout(cct, 25) << "truncate_seq: mds " << truncate_seq << " local "
		 << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
		 << " local " << in->time_warp_seq << dendl;
  uint64_t prior_size = in->size;

  if (inline_version > in->inline_version) {
    in->inline_data = inline_data;
    in->inline_version = inline_version;
  }

  /* always take a newer change attr */
  if (change_attr > in->change_attr)
    in->change_attr = change_attr;

  // take the MDS's size if its truncation state is newer, or if it is
  // the same epoch and the file simply grew
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }

  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    // we hold caps that let us keep local times; only accept the MDS's
    // times when its time_warp_seq says they are authoritative
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
		     << " is higher than local time_warp_seq "
		     << in->time_warp_seq << dendl;
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // MDS sent an older time_warp_seq without us holding EXCL — unexpected
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
782
783void Client::_fragmap_remove_non_leaves(Inode *in)
784{
785 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
786 if (!in->dirfragtree.is_leaf(p->first))
787 in->fragmap.erase(p++);
788 else
789 ++p;
790}
791
792void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
793{
794 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
795 if (p->second == mds)
796 in->fragmap.erase(p++);
797 else
798 ++p;
799}
800
/*
 * Insert a new inode into the cache, or refresh an existing one, from
 * an MDS-supplied InodeStat.  Fields are only overwritten when the
 * MDS's version is strictly newer and when the caps we hold don't make
 * our local copy authoritative.  Also installs/updates the cap the MDS
 * granted with this stat, and marks empty directories I_COMPLETE.
 *
 * Returns the cached Inode (never NULL).
 */
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode ever seen becomes the root
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // pre-mount path walk: track ancestors above the mount root
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  if (was_new)
    ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool updating_inode = false;
  int issued = 0;
  if (st->version == 0 ||
      (in->version & ~1) < st->version) {
    updating_inode = true;

    // treat dirty and implemented caps as issued for the checks below
    int implemented = 0;
    issued = in->caps_issued(&implemented) | in->caps_dirty();
    issued |= implemented;

    in->version = st->version;

    if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
      in->mode = st->mode;
      in->uid = st->uid;
      in->gid = st->gid;
      in->btime = st->btime;
    }

    if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
      in->nlink = st->nlink;
    }

    in->dirstat = st->dirstat;
    in->rstat = st->rstat;
    in->quota = st->quota;
    in->layout = st->layout;

    if (in->is_dir()) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
    }

    update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
			   st->change_attr, st->time_warp_seq, st->ctime,
			   st->mtime, st->atime, st->inline_version,
			   st->inline_data, issued);
  } else if (st->inline_version > in->inline_version) {
    // stale stat overall, but the inline data itself is newer
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  // move me if/when version reflects fragtree changes.
  if (in->dirfragtree != st->dirfragtree) {
    in->dirfragtree = st->dirfragtree;
    _fragmap_remove_non_leaves(in);
  }

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
		   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
		   request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      // only the auth MDS's max_size/rstat are authoritative
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }
  } else
    in->snap_caps |= st->cap.caps;

  // setting I_COMPLETE needs to happen after adding the cap
  if (updating_inode &&
      in->is_dir() &&
      (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
      (issued & CEPH_CAP_FILE_EXCL) == 0 &&
      in->dirstat.nfiles == 0 &&
      in->dirstat.nsubdirs == 0) {
    ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
    in->flags |= I_COMPLETE | I_DIR_ORDERED;
    if (in->dir) {
      ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		     << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
      in->dir->readdir_cache.clear();
      for (auto p = in->dir->dentries.begin();
	   p != in->dir->dentries.end();
	   ++p) {
	unlink(p->second, true, true);  // keep dir, keep dentry
      }
      if (in->dir->dentries.empty())
	close_dir(in->dir);
    }
  }

  return in;
}
936
937
938/*
939 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
940 */
/*
 * Insert + link a single dentry + inode into the metadata cache.
 * If a dentry of that name already points at the right inode it is
 * just touched; a dentry pointing at the wrong inode is unlinked first.
 * 'old_dentry' (e.g. the source of a rename) is unlinked, clearing the
 * ordering state of its parent directory when it differs from 'dir'.
 * Finishes by applying the dentry lease from the reply.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // pin the inode across the relink below
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
986
// Apply a dentry lease from an MDS reply: extend the cached lease (never
// shorten it) and refresh the parent directory's shared-cap generation.
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  // lease expiry = request send time + MDS-granted duration
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
		     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
1006
1007
1008/*
1009 * update MDS location cache for a single inode
1010 */
/*
 * Update the per-frag MDS location cache for one directory inode from a
 * DirStat in an MDS reply: record (or clear) the authoritative MDS for
 * the frag, force the fragtree leaf if needed, and note whether the
 * frag is replicated.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // negative auth: no known authority for this frag
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1042
1043void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1044{
1045 if (diri->flags & I_COMPLETE) {
1046 if (complete) {
1047 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1048 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1049 } else {
1050 if (diri->flags & I_DIR_ORDERED) {
1051 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1052 diri->flags &= ~I_DIR_ORDERED;
1053 }
1054 }
1055 if (diri->dir)
1056 diri->dir->readdir_cache.clear();
1057 }
1058}
1059
1060/*
1061 * insert results from readdir or lssnap into the metadata cache.
1062 */
1063void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
1064
1065 MClientReply *reply = request->reply;
1066 ConnectionRef con = request->reply->get_connection();
1067 uint64_t features = con->get_features();
1068
1069 dir_result_t *dirp = request->dirp;
1070 assert(dirp);
1071
1072 // the extra buffer list is only set for readdir and lssnap replies
1073 bufferlist::iterator p = reply->get_extra_bl().begin();
1074 if (!p.end()) {
1075 // snapdir?
1076 if (request->head.op == CEPH_MDS_OP_LSSNAP) {
1077 assert(diri);
1078 diri = open_snapdir(diri);
1079 }
1080
1081 // only open dir if we're actually adding stuff to it!
1082 Dir *dir = diri->open_dir();
1083 assert(dir);
1084
1085 // dirstat
1086 DirStat dst(p);
1087 __u32 numdn;
1088 __u16 flags;
1089 ::decode(numdn, p);
1090 ::decode(flags, p);
1091
1092 bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
1093 bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
1094
1095 frag_t fg = (unsigned)request->head.args.readdir.frag;
1096 unsigned readdir_offset = dirp->next_offset;
1097 string readdir_start = dirp->last_name;
1098 assert(!readdir_start.empty() || readdir_offset == 2);
1099
1100 unsigned last_hash = 0;
1101 if (hash_order) {
1102 if (!readdir_start.empty()) {
1103 last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
1104 } else if (flags & CEPH_READDIR_OFFSET_HASH) {
1105 /* mds understands offset_hash */
1106 last_hash = (unsigned)request->head.args.readdir.offset_hash;
1107 }
1108 }
1109
1110 if (fg != dst.frag) {
1111 ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
1112 fg = dst.frag;
1113 if (!hash_order) {
1114 readdir_offset = 2;
1115 readdir_start.clear();
1116 dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
1117 }
1118 }
1119
1120 ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
1121 << ", hash_order=" << hash_order
1122 << ", readdir_start " << readdir_start
1123 << ", last_hash " << last_hash
1124 << ", next_offset " << readdir_offset << dendl;
1125
1126 if (diri->snapid != CEPH_SNAPDIR &&
1127 fg.is_leftmost() && readdir_offset == 2 &&
1128 !(hash_order && last_hash)) {
1129 dirp->release_count = diri->dir_release_count;
1130 dirp->ordered_count = diri->dir_ordered_count;
1131 dirp->start_shared_gen = diri->shared_gen;
1132 dirp->cache_index = 0;
1133 }
1134
1135 dirp->buffer_frag = fg;
1136
1137 _readdir_drop_dirp_buffer(dirp);
1138 dirp->buffer.reserve(numdn);
1139
1140 string dname;
1141 LeaseStat dlease;
1142 for (unsigned i=0; i<numdn; i++) {
1143 ::decode(dname, p);
1144 ::decode(dlease, p);
1145 InodeStat ist(p, features);
1146
1147 ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
1148
1149 Inode *in = add_update_inode(&ist, request->sent_stamp, session,
1150 request->perms);
1151 Dentry *dn;
1152 if (diri->dir->dentries.count(dname)) {
1153 Dentry *olddn = diri->dir->dentries[dname];
1154 if (olddn->inode != in) {
1155 // replace incorrect dentry
1156 unlink(olddn, true, true); // keep dir, dentry
1157 dn = link(dir, dname, in, olddn);
1158 assert(dn == olddn);
1159 } else {
1160 // keep existing dn
1161 dn = olddn;
1162 touch_dn(dn);
1163 }
1164 } else {
1165 // new dn
1166 dn = link(dir, dname, in, NULL);
1167 }
1168
1169 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1170 if (hash_order) {
1171 unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
1172 if (hash != last_hash)
1173 readdir_offset = 2;
1174 last_hash = hash;
1175 dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
1176 } else {
1177 dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
1178 }
1179 // add to readdir cache
1180 if (dirp->release_count == diri->dir_release_count &&
1181 dirp->ordered_count == diri->dir_ordered_count &&
1182 dirp->start_shared_gen == diri->shared_gen) {
1183 if (dirp->cache_index == dir->readdir_cache.size()) {
1184 if (i == 0) {
1185 assert(!dirp->inode->is_complete_and_ordered());
1186 dir->readdir_cache.reserve(dirp->cache_index + numdn);
1187 }
1188 dir->readdir_cache.push_back(dn);
1189 } else if (dirp->cache_index < dir->readdir_cache.size()) {
1190 if (dirp->inode->is_complete_and_ordered())
1191 assert(dir->readdir_cache[dirp->cache_index] == dn);
1192 else
1193 dir->readdir_cache[dirp->cache_index] = dn;
1194 } else {
1195 assert(0 == "unexpected readdir buffer idx");
1196 }
1197 dirp->cache_index++;
1198 }
1199 // add to cached result list
1200 dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
1201 ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
1202 }
1203
1204 if (numdn > 0)
1205 dirp->last_name = dname;
1206 if (end)
1207 dirp->next_offset = 2;
1208 else
1209 dirp->next_offset = readdir_offset;
1210
1211 if (dir->is_empty())
1212 close_dir(dir);
1213 }
1214}
1215
1216 /** insert_trace
1217  *
1218  * insert a trace from a MDS reply into the cache.
1219  */
// Decodes the dentry/inode trace carried by an MDS reply and applies it
// to the local metadata cache.  Returns the target inode (if any, also
// stashed in request->target); returns NULL for unsafe-duplicate or
// traceless replies.
1220 Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
1221 {
1222   MClientReply *reply = request->reply;
1223   int op = request->get_op();
1224 
1225   ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
1226 	   << " is_target=" << (int)reply->head.is_target
1227 	   << " is_dentry=" << (int)reply->head.is_dentry
1228 	   << dendl;
1229 
1230   bufferlist::iterator p = reply->get_trace_bl().begin();
1231   if (request->got_unsafe) {
1232     ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
// The safe reply that follows an unsafe one is expected to carry no trace.
1233     assert(p.end());
1234     return NULL;
1235   }
1236 
1237   if (p.end()) {
1238     ldout(cct, 10) << "insert_trace -- no trace" << dendl;
1239 
// Even a traceless reply implies the dentry's parent directory changed,
// so invalidate its cached-complete state.
1240     Dentry *d = request->dentry();
1241     if (d) {
1242       Inode *diri = d->dir->parent_inode;
1243       diri->dir_release_count++;
1244       clear_dir_complete_and_ordered(diri, true);
1245     }
1246 
1247     if (d && reply->get_result() == 0) {
1248       if (op == CEPH_MDS_OP_RENAME) {
1249 	// rename
1250 	Dentry *od = request->old_dentry();
1251 	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
1252 	assert(od);
1253 	unlink(od, true, true);  // keep dir, dentry
1254       } else if (op == CEPH_MDS_OP_RMDIR ||
1255 		 op == CEPH_MDS_OP_UNLINK) {
1256 	// unlink, rmdir
1257 	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
1258 	unlink(d, true, true);  // keep dir, dentry
1259       }
1260     }
1261     return NULL;
1262   }
1263 
1264   ConnectionRef con = request->reply->get_connection();
1265   uint64_t features = con->get_features();
1266   ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
1267 
1268   // snap trace
1269   SnapRealm *realm = NULL;
1270   if (reply->snapbl.length())
1271     update_snap_trace(reply->snapbl, &realm);
1272 
1273   ldout(cct, 10) << " hrm "
1274 	   << " is_target=" << (int)reply->head.is_target
1275 	   << " is_dentry=" << (int)reply->head.is_dentry
1276 	   << dendl;
1277 
1278   InodeStat dirst;
1279   DirStat dst;
1280   string dname;
1281   LeaseStat dlease;
1282   InodeStat ist;
1283 
1284   if (reply->head.is_dentry) {
1285     dirst.decode(p, features);
1286     dst.decode(p);
1287     ::decode(dname, p);
1288     ::decode(dlease, p);
1289   }
1290 
1291   Inode *in = 0;
1292   if (reply->head.is_target) {
1293     ist.decode(p, features);
// Debug cross-check: when we explicitly requested xattrs, the reply must
// actually contain them.
1294     if (cct->_conf->client_debug_getattr_caps) {
1295       unsigned wanted = 0;
1296       if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
1297 	wanted = request->head.args.getattr.mask;
1298       else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
1299 	wanted = request->head.args.open.mask;
1300 
1301       if ((wanted & CEPH_CAP_XATTR_SHARED) &&
1302 	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
1303 	  assert(0 == "MDS reply does not contain xattrs");
1304     }
1305 
1306     in = add_update_inode(&ist, request->sent_stamp, session,
1307 			  request->perms);
1308   }
1309 
1310   Inode *diri = NULL;
1311   if (reply->head.is_dentry) {
1312     diri = add_update_inode(&dirst, request->sent_stamp, session,
1313 			    request->perms);
1314     update_dir_dist(diri, &dst);  // dir stat info is attached to ..
1315 
1316     if (in) {
1317       Dir *dir = diri->open_dir();
1318       insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
1319 			  (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
1320     } else {
// Negative dentry: drop any stale inode link, and cache the negative
// result only if the MDS granted us a lease on it.
1321       Dentry *dn = NULL;
1322       if (diri->dir && diri->dir->dentries.count(dname)) {
1323 	dn = diri->dir->dentries[dname];
1324 	if (dn->inode) {
1325 	  diri->dir_ordered_count++;
1326 	  clear_dir_complete_and_ordered(diri, false);
1327 	  unlink(dn, true, true);  // keep dir, dentry
1328 	}
1329       }
1330       if (dlease.duration_ms > 0) {
1331 	if (!dn) {
1332 	  Dir *dir = diri->open_dir();
1333 	  dn = link(dir, dname, NULL, NULL);
1334 	}
1335 	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1336       }
1337     }
1338   } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
1339 	     op == CEPH_MDS_OP_MKSNAP) {
1340     ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
1341     // fake it for snap lookup
// Snap replies carry no dentry; synthesize one under the .snap dir
// (CEPH_SNAPDIR view of the parent) with no lease.
1342     vinodeno_t vino = ist.vino;
1343     vino.snapid = CEPH_SNAPDIR;
1344     assert(inode_map.count(vino));
1345     diri = inode_map[vino];
1346 
1347     string dname = request->path.last_dentry();
1348 
1349     LeaseStat dlease;
1350     dlease.duration_ms = 0;
1351 
1352     if (in) {
1353       Dir *dir = diri->open_dir();
1354       insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
1355     } else {
1356       if (diri->dir && diri->dir->dentries.count(dname)) {
1357 	Dentry *dn = diri->dir->dentries[dname];
1358 	if (dn->inode)
1359 	  unlink(dn, true, true);  // keep dir, dentry
1360       }
1361     }
1362   }
1363 
1364   if (in) {
1365     if (op == CEPH_MDS_OP_READDIR ||
1366 	op == CEPH_MDS_OP_LSSNAP) {
1367       insert_readdir_results(request, session, in);
1368     } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
1369       // hack: return parent inode instead
1370       in = diri;
1371     }
1372 
1373     if (request->dentry() == NULL && in != request->inode()) {
1374       // pin the target inode if its parent dentry is not pinned
1375       request->set_other_inode(in);
1376     }
1377   }
1378 
1379   if (realm)
1380     put_snap_realm(realm);
1381 
1382   request->target = in;
1383   return in;
1384 }
1385
1386// -------
1387
// Pick the MDS rank a request should be sent to.  Preference order:
//   1. an explicit resend_mds set on the request (consumed here),
//   2. a rank derived from the dirfrag hash of the path/dentry name
//      (phash_diri, if non-NULL, receives the dir inode used for this),
//   3. the auth cap (or any cap) we hold on the relevant inode,
//   4. a random up MDS (also forced by client_use_random_mds).
1388 mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
1389 {
1390   mds_rank_t mds = MDS_RANK_NONE;
1391   __u32 hash = 0;
1392   bool is_hash = false;
1393 
1394   Inode *in = NULL;
1395   Dentry *de = NULL;
1396   Cap *cap = NULL;
1397 
1398   if (req->resend_mds >= 0) {
1399     mds = req->resend_mds;
1400     req->resend_mds = -1;
1401     ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
1402     goto out;
1403   }
1404 
1405   if (cct->_conf->client_use_random_mds)
1406     goto random_mds;
1407 
1408   in = req->inode();
1409   de = req->dentry();
1410   if (in) {
1411     ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
1412     if (req->path.depth()) {
1413       hash = in->hash_dentry_name(req->path[0]);
1414       ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
1415 	       << " on " << req->path[0]
1416 	       << " => " << hash << dendl;
1417       is_hash = true;
1418     }
1419   } else if (de) {
1420     if (de->inode) {
1421       in = de->inode.get();
1422       ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
1423     } else {
1424       in = de->dir->parent_inode;
1425       hash = in->hash_dentry_name(de->name);
1426       ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
1427 	       << " on " << de->name
1428 	       << " => " << hash << dendl;
1429       is_hash = true;
1430     }
1431   }
1432   if (in) {
// Snapped inodes have no caps of their own; walk up to the nearest
// non-snap ancestor and let the MDS redirect if needed.
1433     if (in->snapid != CEPH_NOSNAP) {
1434       ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
1435       while (in->snapid != CEPH_NOSNAP) {
1436         if (in->snapid == CEPH_SNAPDIR)
1437 	  in = in->snapdir_parent.get();
1438         else if (!in->dn_set.empty())
1439           /* In most cases there will only be one dentry, so getting it
1440            * will be the correct action. If there are multiple hard links,
1441            * I think the MDS should be able to redirect as needed*/
1442 	  in = in->get_first_parent()->dir->parent_inode;
1443         else {
1444           ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
1445           break;
1446         }
1447       }
1448       is_hash = false;
1449     }
1450 
1451     ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
1452              << " hash=" << hash << dendl;
1453 
// Route by the fragment's known authoritative rank, if we have one.
1454     if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
1455       frag_t fg = in->dirfragtree[hash];
1456       if (in->fragmap.count(fg)) {
1457 	mds = in->fragmap[fg];
1458 	if (phash_diri)
1459 	  *phash_diri = in;
1460 	ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
1461 	goto out;
1462       }
1463     }
1464 
1465     if (req->auth_is_best())
1466       cap = in->auth_cap;
1467     if (!cap && !in->caps.empty())
1468       cap = in->caps.begin()->second;
1469     if (!cap)
1470       goto random_mds;
1471     mds = cap->session->mds_num;
1472     ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;
1473 
1474     goto out;
1475   }
1476 
1477 random_mds:
1478   if (mds < 0) {
1479     mds = _get_random_up_mds();
1480     ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
1481   }
1482 
1483 out:
1484   ldout(cct, 20) << "mds is " << mds << dendl;
1485   return mds;
1486 }
1487
1488
1489void Client::connect_mds_targets(mds_rank_t mds)
1490{
1491 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1492 assert(mds_sessions.count(mds));
1493 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1494 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1495 q != info.export_targets.end();
1496 ++q) {
1497 if (mds_sessions.count(*q) == 0 &&
1498 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1499 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1500 << " export target mds." << *q << dendl;
1501 _open_mds_session(*q);
1502 }
1503 }
1504}
1505
1506void Client::dump_mds_sessions(Formatter *f)
1507{
1508 f->dump_int("id", get_nodeid().v);
1509 f->open_array_section("sessions");
1510 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1511 f->open_object_section("session");
1512 p->second->dump(f);
1513 f->close_section();
1514 }
1515 f->close_section();
1516 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1517}
1518void Client::dump_mds_requests(Formatter *f)
1519{
1520 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1521 p != mds_requests.end();
1522 ++p) {
1523 f->open_object_section("request");
1524 p->second->dump(f);
1525 f->close_section();
1526 }
1527}
1528
// Post-process a successful reply for callers that asked for the target
// inode (ptarget).  If the reply carried a trace, request->target is
// used directly.  For traceless replies we fall back to a fresh lookup
// by name (or a forced getattr when no dentry is attached) and
// cross-check any created-ino the MDS returned in the extra bufferlist,
// turning a mismatch into -EINTR.  *pcreated reports whether this
// request won the create race.  Returns r, possibly replaced by the
// fallback's result.
1529 int Client::verify_reply_trace(int r,
1530 			       MetaRequest *request, MClientReply *reply,
1531 			       InodeRef *ptarget, bool *pcreated,
1532 			       const UserPerm& perms)
1533 {
1534   // check whether this request actually did the create, and set created flag
1535   bufferlist extra_bl;
1536   inodeno_t created_ino;
1537   bool got_created_ino = false;
1538   ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
1539 
1540   extra_bl.claim(reply->get_extra_bl());
1541   if (extra_bl.length() >= 8) {
1542     // if the extra bufferlist has a buffer, we assume its the created inode
1543     // and that this request to create succeeded in actually creating
1544     // the inode (won the race with other create requests)
1545     ::decode(created_ino, extra_bl);
1546     got_created_ino = true;
1547     ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
1548   }
1549 
1550   if (pcreated)
1551     *pcreated = got_created_ino;
1552 
1553   if (request->target) {
1554     *ptarget = request->target;
1555     ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
1556   } else {
1557     if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
1558       (*ptarget) = p->second;
1559       ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
1560     } else {
1561       // we got a traceless reply, and need to look up what we just
1562       // created. for now, do this by name.  someday, do this by the
1563       // ino... which we know!  FIXME.
1564       InodeRef target;
1565       Dentry *d = request->dentry();
1566       if (d) {
1567 	if (d->dir) {
1568 	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
1569 			 << d->dir->parent_inode->ino << "/" << d->name
1570 			 << " got_ino " << got_created_ino
1571 			 << " ino " << created_ino
1572 			 << dendl;
1573 	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
1574 			 &target, perms);
1575 	} else {
1576 	  // if the dentry is not linked, just do our best. see #5021.
1577 	  assert(0 == "how did this happen?  i want logs!");
1578 	}
1579       } else {
1580 	Inode *in = request->inode();
1581 	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
1582 		       << in->ino << dendl;
1583 	r = _getattr(in, request->regetattr_mask, perms, true);
1584 	target = in;
1585       }
1586       if (r >= 0) {
1587 	// verify ino returned in reply and trace_dist are the same
1588 	if (got_created_ino &&
1589 	    created_ino.val != target->ino.val) {
1590 	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
1591 	  r = -EINTR;
1592 	}
1593 	if (ptarget)
1594 	  ptarget->swap(target);
1595       }
1596     }
1597   }
1598 
1599   return r;
1600 }
1601
1602
1603/**
1604 * make a request
1605 *
1606 * Blocking helper to make an MDS request.
1607 *
1608 * If the ptarget flag is set, behavior changes slightly: the caller
1609 * expects to get a pointer to the inode we are creating or operating
1610 * on. As a result, we will follow up any traceless mutation reply
1611 * with a getattr or lookup to transparently handle a traceless reply
1612 * from the MDS (as when the MDS restarts and the client has to replay
1613 * a request).
1614 *
1615 * @param request the MetaRequest to execute
1616 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1617 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1618 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1619 * @param use_mds [optional] prefer a specific mds (-1 for default)
1620 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1621 */
1622int Client::make_request(MetaRequest *request,
1623 const UserPerm& perms,
1624 InodeRef *ptarget, bool *pcreated,
1625 mds_rank_t use_mds,
1626 bufferlist *pdirbl)
1627{
1628 int r = 0;
1629
1630 // assign a unique tid
1631 ceph_tid_t tid = ++last_tid;
1632 request->set_tid(tid);
1633
1634 // and timestamp
1635 request->op_stamp = ceph_clock_now();
1636
1637 // make note
1638 mds_requests[tid] = request->get();
1639 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1640 oldest_tid = tid;
1641
1642 request->set_caller_perms(perms);
1643
1644 if (cct->_conf->client_inject_fixed_oldest_tid) {
1645 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1646 request->set_oldest_client_tid(1);
1647 } else {
1648 request->set_oldest_client_tid(oldest_tid);
1649 }
1650
1651 // hack target mds?
1652 if (use_mds >= 0)
1653 request->resend_mds = use_mds;
1654
1655 while (1) {
1656 if (request->aborted())
1657 break;
1658
31f18b77
FG
1659 if (blacklisted) {
1660 request->abort(-EBLACKLISTED);
1661 break;
1662 }
1663
7c673cae
FG
1664 // set up wait cond
1665 Cond caller_cond;
1666 request->caller_cond = &caller_cond;
1667
1668 // choose mds
1669 Inode *hash_diri = NULL;
1670 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1671 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1672 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1673 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1674 if (hash_diri) {
1675 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1676 _fragmap_remove_stopped_mds(hash_diri, mds);
1677 } else {
1678 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1679 request->resend_mds = _get_random_up_mds();
1680 }
1681 } else {
1682 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1683 wait_on_list(waiting_for_mdsmap);
1684 }
1685 continue;
1686 }
1687
1688 // open a session?
1689 MetaSession *session = NULL;
1690 if (!have_open_session(mds)) {
1691 session = _get_or_open_mds_session(mds);
1692
1693 // wait
1694 if (session->state == MetaSession::STATE_OPENING) {
1695 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1696 wait_on_context_list(session->waiting_for_open);
1697 // Abort requests on REJECT from MDS
1698 if (rejected_by_mds.count(mds)) {
1699 request->abort(-EPERM);
1700 break;
1701 }
1702 continue;
1703 }
1704
1705 if (!have_open_session(mds))
1706 continue;
1707 } else {
1708 session = mds_sessions[mds];
1709 }
1710
1711 // send request.
1712 send_request(request, session);
1713
1714 // wait for signal
1715 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1716 request->kick = false;
1717 while (!request->reply && // reply
1718 request->resend_mds < 0 && // forward
1719 !request->kick)
1720 caller_cond.Wait(client_lock);
1721 request->caller_cond = NULL;
1722
1723 // did we get a reply?
1724 if (request->reply)
1725 break;
1726 }
1727
1728 if (!request->reply) {
1729 assert(request->aborted());
1730 assert(!request->got_unsafe);
1731 r = request->get_abort_code();
1732 request->item.remove_myself();
1733 unregister_request(request);
1734 put_request(request); // ours
1735 return r;
1736 }
1737
1738 // got it!
1739 MClientReply *reply = request->reply;
1740 request->reply = NULL;
1741 r = reply->get_result();
1742 if (r >= 0)
1743 request->success = true;
1744
1745 // kick dispatcher (we've got it!)
1746 assert(request->dispatch_cond);
1747 request->dispatch_cond->Signal();
1748 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1749 request->dispatch_cond = 0;
1750
1751 if (r >= 0 && ptarget)
1752 r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);
1753
1754 if (pdirbl)
1755 pdirbl->claim(reply->get_extra_bl());
1756
1757 // -- log times --
1758 utime_t lat = ceph_clock_now();
1759 lat -= request->sent_stamp;
1760 ldout(cct, 20) << "lat " << lat << dendl;
1761 logger->tinc(l_c_lat, lat);
1762 logger->tinc(l_c_reply, lat);
1763
1764 put_request(request);
1765
1766 reply->put();
1767 return r;
1768}
1769
1770void Client::unregister_request(MetaRequest *req)
1771{
1772 mds_requests.erase(req->tid);
1773 if (req->tid == oldest_tid) {
1774 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1775 while (true) {
1776 if (p == mds_requests.end()) {
1777 oldest_tid = 0;
1778 break;
1779 }
1780 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1781 oldest_tid = p->first;
1782 break;
1783 }
1784 ++p;
1785 }
1786 }
1787 put_request(req);
1788}
1789
1790void Client::put_request(MetaRequest *request)
1791{
1792 if (request->_put()) {
1793 int op = -1;
1794 if (request->success)
1795 op = request->get_op();
1796 InodeRef other_in;
1797 request->take_other_inode(&other_in);
1798 delete request;
1799
1800 if (other_in &&
1801 (op == CEPH_MDS_OP_RMDIR ||
1802 op == CEPH_MDS_OP_RENAME ||
1803 op == CEPH_MDS_OP_RMSNAP)) {
1804 _try_to_trim_inode(other_in.get(), false);
1805 }
1806 }
1807}
1808
1809int Client::encode_inode_release(Inode *in, MetaRequest *req,
1810 mds_rank_t mds, int drop,
1811 int unless, int force)
1812{
1813 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1814 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1815 << ", have:" << ", force:" << force << ")" << dendl;
1816 int released = 0;
1817 if (in->caps.count(mds)) {
1818 Cap *caps = in->caps[mds];
1819 drop &= ~(in->dirty_caps | get_caps_used(in));
1820 if ((drop & caps->issued) &&
1821 !(unless & caps->issued)) {
1822 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1823 caps->issued &= ~drop;
1824 caps->implemented &= ~drop;
1825 released = 1;
1826 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1827 } else {
1828 released = force;
1829 }
1830 if (released) {
1831 ceph_mds_request_release rel;
1832 rel.ino = in->ino;
1833 rel.cap_id = caps->cap_id;
1834 rel.seq = caps->seq;
1835 rel.issue_seq = caps->issue_seq;
1836 rel.mseq = caps->mseq;
1837 rel.caps = caps->implemented;
1838 rel.wanted = caps->wanted;
1839 rel.dname_len = 0;
1840 rel.dname_seq = 0;
1841 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1842 }
1843 }
1844 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1845 << released << dendl;
1846 return released;
1847}
1848
// Append a release for a dentry's lease to @req: first force an inode
// release record for the parent directory (force=1 so a record exists
// even if no caps were dropped), then piggyback the dentry name/seq on
// that record when we hold a lease on the dentry from this mds.
1849 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1850 			   mds_rank_t mds, int drop, int unless)
1851 {
1852   ldout(cct, 20) << "encode_dentry_release enter(dn:"
1853 	   << dn << ")" << dendl;
1854   int released = 0;
1855   if (dn->dir)
1856     released = encode_inode_release(dn->dir->parent_inode, req,
1857 				    mds, drop, unless, 1);
// Only attach the dentry lease fields if our lease is from this mds.
1858   if (released && dn->lease_mds == mds) {
1859     ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1860     MClientRequest::Release& rel = req->cap_releases.back();
1861     rel.item.dname_len = dn->name.length();
1862     rel.item.dname_seq = dn->lease_seq;
1863     rel.dname = dn->name;
1864   }
1865   ldout(cct, 25) << "encode_dentry_release exit(dn:"
1866 	   << dn << ")" << dendl;
1867 }
1868
1869
1870/*
1871 * This requires the MClientRequest *request member to be set.
1872 * It will error out horribly without one.
1873 * Additionally, if you set any *drop member, you'd better have
1874 * set the corresponding dentry!
1875 */
1876void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1877{
1878 ldout(cct, 20) << "encode_cap_releases enter (req: "
1879 << req << ", mds: " << mds << ")" << dendl;
1880 if (req->inode_drop && req->inode())
1881 encode_inode_release(req->inode(), req,
1882 mds, req->inode_drop,
1883 req->inode_unless);
1884
1885 if (req->old_inode_drop && req->old_inode())
1886 encode_inode_release(req->old_inode(), req,
1887 mds, req->old_inode_drop,
1888 req->old_inode_unless);
1889 if (req->other_inode_drop && req->other_inode())
1890 encode_inode_release(req->other_inode(), req,
1891 mds, req->other_inode_drop,
1892 req->other_inode_unless);
1893
1894 if (req->dentry_drop && req->dentry())
1895 encode_dentry_release(req->dentry(), req,
1896 mds, req->dentry_drop,
1897 req->dentry_unless);
1898
1899 if (req->old_dentry_drop && req->old_dentry())
1900 encode_dentry_release(req->old_dentry(), req,
1901 mds, req->old_dentry_drop,
1902 req->old_dentry_unless);
1903 ldout(cct, 25) << "encode_cap_releases exit (req: "
1904 << req << ", mds " << mds <<dendl;
1905}
1906
1907bool Client::have_open_session(mds_rank_t mds)
1908{
1909 return
1910 mds_sessions.count(mds) &&
1911 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1912 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1913}
1914
1915MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1916{
1917 if (mds_sessions.count(mds) == 0)
1918 return NULL;
1919 MetaSession *s = mds_sessions[mds];
1920 if (s->con != con)
1921 return NULL;
1922 return s;
1923}
1924
1925MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1926{
1927 if (mds_sessions.count(mds))
1928 return mds_sessions[mds];
1929 return _open_mds_session(mds);
1930}
1931
1932/**
1933 * Populate a map of strings with client-identifying metadata,
1934 * such as the hostname. Call this once at initialization.
1935 */
1936void Client::populate_metadata(const std::string &mount_root)
1937{
1938 // Hostname
1939 struct utsname u;
1940 int r = uname(&u);
1941 if (r >= 0) {
1942 metadata["hostname"] = u.nodename;
1943 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1944 } else {
1945 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1946 }
1947
1948 metadata["pid"] = stringify(getpid());
1949
1950 // Ceph entity id (the '0' in "client.0")
1951 metadata["entity_id"] = cct->_conf->name.get_id();
1952
1953 // Our mount position
1954 if (!mount_root.empty()) {
1955 metadata["root"] = mount_root;
1956 }
1957
1958 // Ceph version
1959 metadata["ceph_version"] = pretty_version_to_str();
1960 metadata["ceph_sha1"] = git_version_to_str();
1961
1962 // Apply any metadata from the user's configured overrides
1963 std::vector<std::string> tokens;
1964 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1965 for (const auto &i : tokens) {
1966 auto eqpos = i.find("=");
1967 // Throw out anything that isn't of the form "<str>=<str>"
1968 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1969 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1970 continue;
1971 }
1972 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1973 }
1974}
1975
1976/**
1977 * Optionally add or override client metadata fields.
1978 */
1979void Client::update_metadata(std::string const &k, std::string const &v)
1980{
1981 Mutex::Locker l(client_lock);
1982 assert(initialized);
1983
1984 if (metadata.count(k)) {
1985 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
1986 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
1987 }
1988
1989 metadata[k] = v;
1990}
1991
// Create session state for mds and (unless this daemon previously
// REJECTed us at the same inst) send a session-open request.  The
// returned session starts in STATE_OPENING; callers wait on
// waiting_for_open for the transition.
1992 MetaSession *Client::_open_mds_session(mds_rank_t mds)
1993 {
1994   ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
1995   assert(mds_sessions.count(mds) == 0);
1996   MetaSession *session = new MetaSession;
1997   session->mds_num = mds;
1998   session->seq = 0;
1999   session->inst = mdsmap->get_inst(mds);
2000   session->con = messenger->get_connection(session->inst);
2001   session->state = MetaSession::STATE_OPENING;
2002   session->mds_state = MDSMap::STATE_NULL;
2003   mds_sessions[mds] = session;
2004 
2005   // Maybe skip sending a request to open if this MDS daemon
2006   // has previously sent us a REJECT.
2007   if (rejected_by_mds.count(mds)) {
// A rejection only stands while the rank is served by the same inst;
// a new daemon instance gets a fresh chance.
2008     if (rejected_by_mds[mds] == session->inst) {
2009       ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
2010                        "because we were rejected" << dendl;
2011       return session;
2012     } else {
2013       ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
2014                        "rejected us, trying with new inst" << dendl;
2015       rejected_by_mds.erase(mds);
2016     }
2017   }
2018 
2019   MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
2020   m->client_meta = metadata;
2021   session->con->send_message(m);
2022   return session;
2023 }
2024
// Request an orderly close of the session: mark it CLOSING locally and
// ask the MDS to close it; teardown happens in _closed_mds_session()
// once the close is confirmed.
2025 void Client::_close_mds_session(MetaSession *s)
2026 {
2027   ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
2028   s->state = MetaSession::STATE_CLOSING;
2029   s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2030 }
2031
// Tear down local state for a session that is now closed: drop the
// connection, wake open-waiters and the mounter, release all caps held
// through it, kick its in-flight requests, then erase and free it.
2032 void Client::_closed_mds_session(MetaSession *s)
2033 {
2034   s->state = MetaSession::STATE_CLOSED;
2035   s->con->mark_down();
2036   signal_context_list(s->waiting_for_open);
2037   mount_cond.Signal();
2038   remove_session_caps(s);
2039   kick_requests_closed(s);
2040   mds_sessions.erase(s->mds_num);
2041   delete s;
2042 }
2043
// Dispatch an incoming MDS session control message.  Messages from a
// connection that no longer matches the tracked session are discarded.
// Consumes (puts) the message.
2044 void Client::handle_client_session(MClientSession *m)
2045 {
2046   mds_rank_t from = mds_rank_t(m->get_source().num());
2047   ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;
2048 
2049   MetaSession *session = _get_mds_session(from, m->get_connection().get());
2050   if (!session) {
2051     ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2052     m->put();
2053     return;
2054   }
2055 
2056   switch (m->get_op()) {
2057   case CEPH_SESSION_OPEN:
2058     renew_caps(session);
2059     session->state = MetaSession::STATE_OPEN;
2060     if (unmounting)
2061       mount_cond.Signal();
2062     else
2063       connect_mds_targets(from);
2064     signal_context_list(session->waiting_for_open);
2065     break;
2066 
2067   case CEPH_SESSION_CLOSE:
2068     _closed_mds_session(session);
2069     break;
2070 
2071   case CEPH_SESSION_RENEWCAPS:
// Only honor the renewal if it matches our outstanding renew seq.
2072     if (session->cap_renew_seq == m->get_seq()) {
2073       session->cap_ttl =
2074 	session->last_cap_renew_request + mdsmap->get_session_timeout();
2075       wake_inode_waiters(session);
2076     }
2077     break;
2078 
2079   case CEPH_SESSION_STALE:
28e407b8
AA
2080     // invalidate session caps/leases
2081     session->cap_gen++;
2082     session->cap_ttl = ceph_clock_now();
2083     session->cap_ttl -= 1;
7c673cae
FG
2084     renew_caps(session);
2085     break;
2086 
2087   case CEPH_SESSION_RECALL_STATE:
2088     trim_caps(session, m->get_max_caps());
2089     break;
2090 
2091   case CEPH_SESSION_FLUSHMSG:
2092     session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2093     break;
2094 
2095   case CEPH_SESSION_FORCE_RO:
2096     force_session_readonly(session);
2097     break;
2098 
2099   case CEPH_SESSION_REJECT:
// Remember the rejecting inst so we don't immediately re-open to it.
2100     rejected_by_mds[session->mds_num] = session->inst;
2101     _closed_mds_session(session);
2102 
2103     break;
2104 
2105   default:
2106     ceph_abort();
2107   }
2108 
2109   m->put();
2110 }
2111
2112bool Client::_any_stale_sessions() const
2113{
2114 assert(client_lock.is_locked_by_me());
2115
2116 for (const auto &i : mds_sessions) {
2117 if (i.second->state == MetaSession::STATE_STALE) {
2118 return true;
2119 }
2120 }
2121
2122 return false;
2123}
2124
2125void Client::_kick_stale_sessions()
2126{
2127 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2128
2129 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2130 p != mds_sessions.end(); ) {
2131 MetaSession *s = p->second;
2132 ++p;
2133 if (s->state == MetaSession::STATE_STALE)
2134 _closed_mds_session(s);
2135 }
2136}
2137
// (Re)build the wire message for a MetaRequest and send it on the given
// session. drop_cap_releases is set while replaying before cap reconnect,
// when embedded cap releases must not be sent.
2138void Client::send_request(MetaRequest *request, MetaSession *session,
2139			  bool drop_cap_releases)
2140{
2141  // make the request
2142  mds_rank_t mds = session->mds_num;
2143  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
2144		 << " for mds." << mds << dendl;
2145  MClientRequest *r = build_client_request(request);
2146  if (request->dentry()) {
2147    r->set_dentry_wanted();
2148  }
  // A request that already got an unsafe reply is a replay: mark it so and
  // pin the target ino; otherwise attach any pending cap releases.
2149  if (request->got_unsafe) {
2150    r->set_replayed_op();
2151    if (request->target)
2152      r->head.ino = request->target->ino;
2153  } else {
2154    encode_cap_releases(request, mds);
2155    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
2156      request->cap_releases.clear();
2157    else
2158      r->releases.swap(request->cap_releases);
2159  }
2160  r->set_mdsmap_epoch(mdsmap->get_epoch());
  // setxattr can change the file layout; the MDS needs our osdmap epoch to
  // validate pool references.
2161  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2162    objecter->with_osdmap([r](const OSDMap& o) {
2163	r->set_osdmap_epoch(o.get_epoch());
2164      });
2165  }
2166
  // Stamp only on first send (mds == -1), so retries keep the original time.
2167  if (request->mds == -1) {
2168    request->sent_stamp = ceph_clock_now();
2169    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
2170  }
2171  request->mds = mds;
2172
  // Record the cap migration seq at send time; used to detect ESTALE races.
2173  Inode *in = request->inode();
2174  if (in && in->caps.count(mds))
2175    request->sent_on_mseq = in->caps[mds]->mseq;
2176
2177  session->requests.push_back(&request->item);
2178
2179  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
2180  session->con->send_message(r);
2181}
2182
// Construct a fresh MClientRequest (wire message) from a MetaRequest.
// Also lazily fills in the request's filepath from its inode/dentry and
// bumps retry_attempt. Caller owns the returned message.
2183MClientRequest* Client::build_client_request(MetaRequest *request)
2184{
2185  MClientRequest *req = new MClientRequest(request->get_op());
2186  req->set_tid(request->tid);
2187  req->set_stamp(request->op_stamp);
2188  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2189
2190  // if the filepath's haven't been set, set them!
2191  if (request->path.empty()) {
2192    Inode *in = request->inode();
2193    Dentry *de = request->dentry();
    // Prefer the inode's own path; fall back to the dentry (resolved inode
    // first, then parent dir + dentry name).
2194    if (in)
2195      in->make_nosnap_relative_path(request->path);
2196    else if (de) {
2197      if (de->inode)
2198	de->inode->make_nosnap_relative_path(request->path);
2199      else if (de->dir) {
2200	de->dir->parent_inode->make_nosnap_relative_path(request->path);
2201	request->path.push_dentry(de->name);
2202      }
2203      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2204	<< " No path, inode, or appropriately-endowed dentry given!"
2205	<< dendl;
2206    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2207      << " No path, inode, or dentry given!"
2208      << dendl;
2209  }
2210  req->set_filepath(request->get_filepath());
2211  req->set_filepath2(request->get_filepath2());
2212  req->set_data(request->data);
  // post-increment: the message carries the old count, then we bump it.
2213  req->set_retry_attempt(request->retry_attempt++);
2214  req->head.num_fwd = request->num_fwd;
2215  const gid_t *_gids;
2216  int gid_count = request->perms.get_gids(&_gids);
2217  req->set_gid_list(gid_count, _gids);
2218  return req;
2219}
2220
2221
2222
2223void Client::handle_client_request_forward(MClientRequestForward *fwd)
2224{
2225 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2226 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2227 if (!session) {
2228 fwd->put();
2229 return;
2230 }
2231 ceph_tid_t tid = fwd->get_tid();
2232
2233 if (mds_requests.count(tid) == 0) {
2234 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2235 fwd->put();
2236 return;
2237 }
2238
2239 MetaRequest *request = mds_requests[tid];
2240 assert(request);
2241
2242 // reset retry counter
2243 request->retry_attempt = 0;
2244
2245 // request not forwarded, or dest mds has no session.
2246 // resend.
2247 ldout(cct, 10) << "handle_client_request tid " << tid
2248 << " fwd " << fwd->get_num_fwd()
2249 << " to mds." << fwd->get_dest_mds()
2250 << ", resending to " << fwd->get_dest_mds()
2251 << dendl;
2252
2253 request->mds = -1;
2254 request->item.remove_myself();
2255 request->num_fwd = fwd->get_num_fwd();
2256 request->resend_mds = fwd->get_dest_mds();
2257 request->caller_cond->Signal();
2258
2259 fwd->put();
2260}
2261
2262bool Client::is_dir_operation(MetaRequest *req)
2263{
2264 int op = req->get_op();
2265 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2266 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2267 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2268 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2269 return true;
2270 return false;
2271}
2272
// Handle an MDS reply (unsafe = applied in memory, safe = committed).
// Stores the reply on the request, tracks unsafe requests for replay, and
// hands control to the issuing thread via a cond-var handshake.
2273void Client::handle_client_reply(MClientReply *reply)
2274{
2275  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2276  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2277  if (!session) {
2278    reply->put();
2279    return;
2280  }
2281
2282  ceph_tid_t tid = reply->get_tid();
2283  bool is_safe = reply->is_safe();
2284
2285  if (mds_requests.count(tid) == 0) {
2286    lderr(cct) << "handle_client_reply no pending request on tid " << tid
2287	       << " safe is:" << is_safe << dendl;
2288    reply->put();
2289    return;
2290  }
2291  MetaRequest *request = mds_requests.at(tid);
2292
2293  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
2294		 << " tid " << tid << dendl;
2295
  // A second unsafe reply for the same tid is a duplicate; drop it.
2296  if (request->got_unsafe && !is_safe) {
2297    //duplicate response
2298    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2299		  << mds_num << " safe:" << is_safe << dendl;
2300    reply->put();
2301    return;
2302  }
2303
  // ESTALE: try to re-route to the auth MDS; only give up (and surface
  // ESTALE) if re-sending would target the same MDS with unchanged mseq.
2304  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2305    ldout(cct, 20) << "got ESTALE on tid " << request->tid
2306		   << " from mds." << request->mds << dendl;
2307    request->send_to_auth = true;
2308    request->resend_mds = choose_target_mds(request);
2309    Inode *in = request->inode();
2310    if (request->resend_mds >= 0 &&
2311	request->resend_mds == request->mds &&
2312	(in == NULL ||
2313	 in->caps.count(request->resend_mds) == 0 ||
2314	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
2315      // have to return ESTALE
2316    } else {
2317      request->caller_cond->Signal();
2318      reply->put();
2319      return;
2320    }
2321    ldout(cct, 20) << "have to return ESTALE" << dendl;
2322  }
2323
2324  assert(request->reply == NULL);
2325  request->reply = reply;
2326  insert_trace(request, session);
2327
2328  // Handle unsafe reply
  // Track on the session and on the affected inodes so the op can be
  // replayed if the MDS fails before committing.
2329  if (!is_safe) {
2330    request->got_unsafe = true;
2331    session->unsafe_requests.push_back(&request->unsafe_item);
2332    if (is_dir_operation(request)) {
2333      Inode *dir = request->inode();
2334      assert(dir);
2335      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2336    }
2337    if (request->target) {
2338      InodeRef &in = request->target;
2339      in->unsafe_ops.push_back(&request->unsafe_target_item);
2340    }
2341  }
2342
2343  // Only signal the caller once (on the first reply):
2344  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2345  if (!is_safe || !request->got_unsafe) {
2346    Cond cond;
2347    request->dispatch_cond = &cond;
2348
2349    // wake up waiter
2350    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
2351    request->caller_cond->Signal();
2352
2353    // wake for kick back
    // The caller clears dispatch_cond and signals it once it has consumed
    // the reply; we must not proceed (or free the request) before then.
2354    while (request->dispatch_cond) {
2355      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
2356      cond.Wait(client_lock);
2357    }
2358  }
2359
2360  if (is_safe) {
2361    // the filesystem change is committed to disk
2362    // we're done, clean up
2363    if (request->got_unsafe) {
2364      request->unsafe_item.remove_myself();
2365      request->unsafe_dir_item.remove_myself();
2366      request->unsafe_target_item.remove_myself();
2367      signal_cond_list(request->waitfor_safe);
2368    }
2369    request->item.remove_myself();
2370    unregister_request(request);
2371  }
2372  if (unmounting)
2373    mount_cond.Signal();
2374}
2375
// React to a pool (or, with pool == -1, the whole cluster) going FULL:
// cancel in-flight writes with -ENOSPC and purge matching dirty cache so
// the writes are not re-issued.
2376void Client::_handle_full_flag(int64_t pool)
2377{
2378  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2379    << "on " << pool << dendl;
2380  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2381  // to do this rather than blocking, because otherwise when we fill up we
2382  // potentially lock caps forever on files with dirty pages, and we need
2383  // to be able to release those caps to the MDS so that it can delete files
2384  // and free up space.
2385  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2386
2387  // For all inodes with layouts in this pool and a pending flush write op
2388  // (i.e. one of the ones we will cancel), we've got to purge_set their data
2389  // from ObjectCacher so that it doesn't re-issue the write in response to
2390  // the ENOSPC error.
2391  // Fortunately since we're cancelling everything in a given pool, we don't
2392  // need to know which ops belong to which ObjectSet, we can just blow all
2393  // the un-flushed cached data away and mark any dirty inodes' async_err
2394  // field with -ENOSPC as long as we're sure all the ops we cancelled were
2395  // affecting this pool, and all the objectsets we're purging were also
2396  // in this pool.
2397  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2398       i != inode_map.end(); ++i)
2399  {
2400    Inode *inode = i->second;
2401    if (inode->oset.dirty_or_tx
2402        && (pool == -1 || inode->layout.pool_id == pool)) {
2403      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2404        << " has dirty objects, purging and setting ENOSPC" << dendl;
2405      objectcacher->purge_set(&inode->oset);
2406      inode->set_async_err(-ENOSPC);
2407    }
2408  }
2409
  // Remember the epoch at which we cancelled so caps released to the MDS
  // are barriered against OSD maps older than the cancellation.
2410  if (cancelled_epoch != (epoch_t)-1) {
2411    set_cap_epoch_barrier(cancelled_epoch);
2412  }
2413}
2414
// Process a new OSD map: detect (un)blacklisting of this client and
// handle FULL flags (global and per-pool). Consumes the message ref.
2415void Client::handle_osd_map(MOSDMap *m)
2416{
31f18b77
FG
2417  std::set<entity_addr_t> new_blacklists;
2418  objecter->consume_blacklist_events(&new_blacklists);
2419
2420  const auto myaddr = messenger->get_myaddr();
2421  if (!blacklisted && new_blacklists.count(myaddr)) {
2422    auto epoch = objecter->with_osdmap([](const OSDMap &o){
2423        return o.get_epoch();
2424        });
2425    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2426    blacklisted = true;
    // Abort every pending MDS request; their state may be abandoned.
2427    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2428         p != mds_requests.end(); )
2429    {
2430      auto req = p->second;
2431      ++p;
2432      req->abort(-EBLACKLISTED);
2433      if (req->caller_cond) {
2434        req->kick = true;
2435        req->caller_cond->Signal();
2436      }
2437    }
2438
2439    // Progress aborts on any requests that were on this waitlist.  Any
2440    // requests that were on a waiting_for_open session waitlist
2441    // will get kicked during close session below.
2442    signal_cond_list(waiting_for_mdsmap);
2443
2444    // Force-close all sessions: assume this is not abandoning any state
2445    // on the MDS side because the MDS will have seen the blacklist too.
2446    while(!mds_sessions.empty()) {
2447      auto i = mds_sessions.begin();
2448      auto session = i->second;
2449      _closed_mds_session(session);
2450    }
2451
2452    // Since we know all our OSD ops will fail, cancel them all preemtively,
2453    // so that on an unhealthy cluster we can umount promptly even if e.g.
2454    // some PGs were inaccessible.
2455    objecter->op_cancel_writes(-EBLACKLISTED);
2456
2457  } else if (blacklisted) {
2458    // Handle case where we were blacklisted but no longer are
2459    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
2460        return o.is_blacklisted(myaddr);});
2461  }
2462
7c673cae
FG
  // Global full flag: cancel writes everywhere (pool == -1);
  // otherwise check each pool's FULL flag individually.
2462  if (objecter->osdmap_full_flag()) {
2463    _handle_full_flag(-1);
2464  } else {
2465    // Accumulate local list of full pools so that I can drop
2466    // the objecter lock before re-entering objecter in
2467    // cancel_writes
2468    std::vector<int64_t> full_pools;
2469
2470    objecter->with_osdmap([&full_pools](const OSDMap &o) {
2471	for (const auto& kv : o.get_pools()) {
2472	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2473	    full_pools.push_back(kv.first);
2474	  }
2475	}
2476      });
2477
2478    for (auto p : full_pools)
2479      _handle_full_flag(p);
2480
2481    // Subscribe to subsequent maps to watch for the full flag going
2482    // away.  For the global full flag objecter does this for us, but
2483    // it pays no attention to the per-pool full flag so in this branch
2484    // we do it ourselves.
2485    if (!full_pools.empty()) {
2486      objecter->maybe_request_map();
2487    }
2488  }
2489
2490  m->put();
2491}
2492
2493
2494// ------------------------
2495// incoming messages
2496
2497
// Messenger entry point: route incoming messages to their handlers under
// client_lock. Returns false for message types we do not handle (so the
// messenger can offer them elsewhere).
2498bool Client::ms_dispatch(Message *m)
2499{
2500  Mutex::Locker l(client_lock);
2501  if (!initialized) {
2502    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2503    m->put();
2504    return true;
2505  }
2506
2507  switch (m->get_type()) {
2508    // mounting and mds sessions
2509  case CEPH_MSG_MDS_MAP:
2510    handle_mds_map(static_cast<MMDSMap*>(m));
2511    break;
2512  case CEPH_MSG_FS_MAP:
2513    handle_fs_map(static_cast<MFSMap*>(m));
2514    break;
2515  case CEPH_MSG_FS_MAP_USER:
2516    handle_fs_map_user(static_cast<MFSMapUser*>(m));
2517    break;
2518  case CEPH_MSG_CLIENT_SESSION:
2519    handle_client_session(static_cast<MClientSession*>(m));
2520    break;
2521
2522  case CEPH_MSG_OSD_MAP:
2523    handle_osd_map(static_cast<MOSDMap*>(m));
2524    break;
2525
2526    // requests
2527  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2528    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
2529    break;
2530  case CEPH_MSG_CLIENT_REPLY:
2531    handle_client_reply(static_cast<MClientReply*>(m));
2532    break;
2533
2534  case CEPH_MSG_CLIENT_SNAP:
2535    handle_snap(static_cast<MClientSnap*>(m));
2536    break;
2537  case CEPH_MSG_CLIENT_CAPS:
2538    handle_caps(static_cast<MClientCaps*>(m));
2539    break;
2540  case CEPH_MSG_CLIENT_LEASE:
2541    handle_lease(static_cast<MClientLease*>(m));
2542    break;
2543  case MSG_COMMAND_REPLY:
    // Only MDS command replies are ours; others belong to a different
    // dispatcher (e.g. the mon client).
2544    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2545      handle_command_reply(static_cast<MCommandReply*>(m));
2546    } else {
2547      return false;
2548    }
2549    break;
2550  case CEPH_MSG_CLIENT_QUOTA:
2551    handle_quota(static_cast<MClientQuota*>(m));
2552    break;
2553
2554  default:
2555    return false;
2556  }
2557
2558  // unmounting?
  // Opportunistic cache trim during unmount: if anything shrank, poke the
  // unmount() waiter so it can re-check for completion.
2559  if (unmounting) {
2560    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2561             << "+" << inode_map.size() << dendl;
2562    long unsigned size = lru.lru_get_size() + inode_map.size();
2563    trim_cache();
2564    if (size < lru.lru_get_size() + inode_map.size()) {
2565      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2566      mount_cond.Signal();
2567    } else {
2568      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2569               << "+" << inode_map.size() << dendl;
2570    }
2571  }
2572
2573  return true;
2574}
2575
// Install a new FSMap, wake waiters, and ack the subscription.
// Consumes the message ref.
2576void Client::handle_fs_map(MFSMap *m)
2577{
2578  fsmap.reset(new FSMap(m->get_fsmap()));
2579  m->put();
2580
2581  signal_cond_list(waiting_for_fsmap);
2582
2583  monclient->sub_got("fsmap", fsmap->get_epoch());
2584}
2585
// Install a new user-visible FSMap (compact form), ack the subscription,
// and wake waiters. Consumes the message ref.
2586void Client::handle_fs_map_user(MFSMapUser *m)
2587{
2588  fsmap_user.reset(new FSMapUser);
2589  *fsmap_user = m->get_fsmap();
2590  m->put();
2591
2592  monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2593  signal_cond_list(waiting_for_fsmap);
2594}
2595
// Process a new MDSMap: cancel commands to vanished/laggy MDS daemons and
// walk every session, reconciling it with the rank's new state (down,
// replaced instance, reconnect, active, null). Consumes the message ref.
2598void Client::handle_mds_map(MMDSMap* m)
2599{
2600  if (m->get_epoch() <= mdsmap->get_epoch()) {
2601    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
2602                  << " is identical to or older than our "
2603                  << mdsmap->get_epoch() << dendl;
2604    m->put();
2605    return;
2606  }
2607
2608  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;
2609
  // Keep the previous map so we can compare per-rank state transitions.
2610  std::unique_ptr<MDSMap> oldmap(new MDSMap);
2611  oldmap.swap(mdsmap);
2612
2613  mdsmap->decode(m->get_encoded());
2614
2615  // Cancel any commands for missing or laggy GIDs
2616  std::list<ceph_tid_t> cancel_ops;
2617  auto &commands = command_table.get_commands();
2618  for (const auto &i : commands) {
2619    auto &op = i.second;
2620    const mds_gid_t op_mds_gid = op.mds_gid;
2621    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2622      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2623      cancel_ops.push_back(i.first);
2624      if (op.outs) {
2625        std::ostringstream ss;
2626        ss << "MDS " << op_mds_gid << " went away";
2627        *(op.outs) = ss.str();
2628      }
2629      op.con->mark_down();
2630      if (op.on_finish) {
2631        op.on_finish->complete(-ETIMEDOUT);
2632      }
2633    }
2634  }
2635
  // Erase after the scan: erasing inside the loop would invalidate the
  // command-table iterator.
2636  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2637       i != cancel_ops.end(); ++i) {
2638    command_table.erase(*i);
2639  }
2640
2641  // reset session
2642  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2643       p != mds_sessions.end(); ) {
2644    mds_rank_t mds = p->first;
2645    MetaSession *session = p->second;
2646    ++p;
2647
2648    int oldstate = oldmap->get_state(mds);
2649    int newstate = mdsmap->get_state(mds);
2650    if (!mdsmap->is_up(mds)) {
2651      session->con->mark_down();
2652    } else if (mdsmap->get_inst(mds) != session->inst) {
      // The rank is now served by a different daemon instance.
2653      session->con->mark_down();
2654      session->inst = mdsmap->get_inst(mds);
2655      // When new MDS starts to take over, notify kernel to trim unused entries
2656      // in its dcache/icache. Hopefully, the kernel will release some unused
2657      // inodes before the new MDS enters reconnect state.
2658      trim_cache_for_reconnect(session);
2659    } else if (oldstate == newstate)
2660      continue;  // no change
2661
2662    session->mds_state = newstate;
2663    if (newstate == MDSMap::STATE_RECONNECT) {
2664      session->con = messenger->get_connection(session->inst);
2665      send_reconnect(session);
2666    } else if (newstate >= MDSMap::STATE_ACTIVE) {
2667      if (oldstate < MDSMap::STATE_ACTIVE) {
2668	// kick new requests
2669	kick_requests(session);
2670	kick_flushing_caps(session);
2671	signal_context_list(session->waiting_for_open);
2672	kick_maxsize_requests(session);
2673	wake_inode_waiters(session);
2674      }
2675      connect_mds_targets(mds);
2676    } else if (newstate == MDSMap::STATE_NULL &&
2677	       mds >= mdsmap->get_max_mds()) {
      // Rank no longer exists in the cluster; drop the session.
2678      _closed_mds_session(session);
2679    }
2680  }
2681
2682  // kick any waiting threads
2683  signal_cond_list(waiting_for_mdsmap);
2684
2685  m->put();
2686
2687  monclient->sub_got("mdsmap", mdsmap->get_epoch());
2688}
2687
// Rebuild our state on an MDS entering reconnect: re-announce every cap we
// hold (with seqs reset), our snaprealms, file locks, and unsafe requests.
2688void Client::send_reconnect(MetaSession *session)
2689{
2690  mds_rank_t mds = session->mds_num;
2691  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;
2692
2693  // trim unused caps to reduce MDS's cache rejoin time
2694  trim_cache_for_reconnect(session);
2695
2696  session->readonly = false;
2697
  // Any queued cap-release message is obsolete now; the new MDS rebuilds
  // cap state from this reconnect instead.
2698  if (session->release) {
2699    session->release->put();
2700    session->release = NULL;
2701  }
2702
2703  // reset my cap seq number
2704  session->seq = 0;
2705  //connect to the mds' offload targets
2706  connect_mds_targets(mds);
2707  //make sure unsafe requests get saved
2708  resend_unsafe_requests(session);
2709
2710  MClientReconnect *m = new MClientReconnect;
2711
2712  // i have an open session.
2713  ceph::unordered_set<inodeno_t> did_snaprealm;
2714  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2715       p != inode_map.end();
2716       ++p) {
2717    Inode *in = p->second;
2718    if (in->caps.count(mds)) {
2719      ldout(cct, 10) << " caps on " << p->first
2720		     << " " << ccap_string(in->caps[mds]->issued)
2721		     << " wants " << ccap_string(in->caps_wanted())
2722		     << dendl;
2723      filepath path;
2724      in->make_long_path(path);
2725      ldout(cct, 10) << "    path " << path << dendl;
2726
2727      bufferlist flockbl;
2728      _encode_filelocks(in, flockbl);
2729
      // Seq numbers restart with the new MDS incarnation; claim everything
      // we had implemented as issued so nothing is silently lost.
2730      Cap *cap = in->caps[mds];
2731      cap->seq = 0;  // reset seq.
2732      cap->issue_seq = 0;  // reset seq.
2733      cap->mseq = 0;  // reset seq.
2734      cap->issued = cap->implemented;
2735
2736      snapid_t snap_follows = 0;
2737      if (!in->cap_snaps.empty())
2738        snap_follows = in->cap_snaps.begin()->first;
2739
2740      m->add_cap(p->first.ino,
2741		 cap->cap_id,
2742		 path.get_ino(), path.get_path(),   // ino
2743		 in->caps_wanted(), // wanted
2744		 cap->issued,     // issued
2745		 in->snaprealm->ino,
2746		 snap_follows,
2747		 flockbl);
2748
      // Describe each snaprealm only once per reconnect message.
2749      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2750	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2751	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2752	did_snaprealm.insert(in->snaprealm->ino);
2753      }
2754    }
2755  }
2756
2757  early_kick_flushing_caps(session);
2758
2759  session->con->send_message(m);
2760
2761  mount_cond.Signal();
2762}
2763
2764
// Resend this session's *new* requests (never sent or never got an unsafe
// reply) after the MDS becomes active; wake callers of aborted requests.
2765void Client::kick_requests(MetaSession *session)
2766{
2767  ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2768  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2769       p != mds_requests.end();
2770       ++p) {
31f18b77
FG
2771    MetaRequest *req = p->second;
    // Unsafe-replied requests are replayed via resend_unsafe_requests().
2772    if (req->got_unsafe)
2773      continue;
2774    if (req->aborted()) {
2775      if (req->caller_cond) {
2776	req->kick = true;
2777	req->caller_cond->Signal();
2778      }
7c673cae 2779      continue;
31f18b77
FG
2780    }
2781    if (req->retry_attempt > 0)
7c673cae 2782      continue; // new requests only
31f18b77 2783    if (req->mds == session->mds_num) {
7c673cae
FG
2784      send_request(p->second, session);
2785    }
2786  }
2787}
2788
// Replay requests during MDS reconnect: first everything that already got
// an unsafe reply, then previously-sent (retried) requests so the MDS can
// finish them in clientreplay.
2789void Client::resend_unsafe_requests(MetaSession *session)
2790{
2791  for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2792       !iter.end();
2793       ++iter)
2794    send_request(*iter, session);
2795
2796  // also re-send old requests when MDS enters reconnect stage. So that MDS can
2797  // process completed requests in clientreplay stage.
2798  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2799       p != mds_requests.end();
2800       ++p) {
2801    MetaRequest *req = p->second;
2802    if (req->got_unsafe)
2803      continue;
31f18b77
FG
2804    if (req->aborted())
2805      continue;
7c673cae
FG
2806    if (req->retry_attempt == 0)
2807      continue; // old requests only
2808    if (req->mds == session->mds_num)
      // drop_cap_releases=true: cap reconnect has not been sent yet.
2809      send_request(req, session, true);
2810  }
2811}
2812
2813void Client::wait_unsafe_requests()
2814{
2815 list<MetaRequest*> last_unsafe_reqs;
2816 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2817 p != mds_sessions.end();
2818 ++p) {
2819 MetaSession *s = p->second;
2820 if (!s->unsafe_requests.empty()) {
2821 MetaRequest *req = s->unsafe_requests.back();
2822 req->get();
2823 last_unsafe_reqs.push_back(req);
2824 }
2825 }
2826
2827 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2828 p != last_unsafe_reqs.end();
2829 ++p) {
2830 MetaRequest *req = *p;
2831 if (req->unsafe_item.is_on_list())
2832 wait_on_list(req->waitfor_safe);
2833 put_request(req);
2834 }
2835}
2836
// A session closed for good: wake every caller with a request routed to
// that MDS and tear down unsafe-request bookkeeping (those ops will never
// be committed by this MDS).
2837void Client::kick_requests_closed(MetaSession *session)
2838{
2839  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
2840  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2841       p != mds_requests.end(); ) {
2842    MetaRequest *req = p->second;
    // Advance before unregister_request() may free the entry.
2843    ++p;
2844    if (req->mds == session->mds_num) {
2845      if (req->caller_cond) {
2846	req->kick = true;
2847	req->caller_cond->Signal();
2848      }
2849      req->item.remove_myself();
2850      if (req->got_unsafe) {
2851	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
2852	req->unsafe_item.remove_myself();
2853	req->unsafe_dir_item.remove_myself();
2854	req->unsafe_target_item.remove_myself();
2855	signal_cond_list(req->waitfor_safe);
2856	unregister_request(req);
2857      }
2858    }
2859  }
2860  assert(session->requests.empty());
2861  assert(session->unsafe_requests.empty());
2862}
2863
2864
2865
2866
2867/************
2868 * leases
2869 */
2870
2871void Client::got_mds_push(MetaSession *s)
2872{
2873 s->seq++;
2874 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2875 if (s->state == MetaSession::STATE_CLOSING) {
2876 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2877 }
2878}
2879
// Handle a dentry-lease revoke from an MDS: invalidate our cached lease
// (if we still have the dentry) and always ack with a LEASE_RELEASE.
// Consumes the message ref.
2880void Client::handle_lease(MClientLease *m)
2881{
2882  ldout(cct, 10) << "handle_lease " << *m << dendl;
2883
2884  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
2885
2886  mds_rank_t mds = mds_rank_t(m->get_source().num());
2887  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
2888  if (!session) {
2889    m->put();
2890    return;
2891  }
2892
2893  got_mds_push(session);
2894
2895  ceph_seq_t seq = m->get_seq();
2896
2897  Inode *in;
2898  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  // Not having the inode/dentry is fine: we just ack the revoke anyway.
2899  if (inode_map.count(vino) == 0) {
2900    ldout(cct, 10) << " don't have vino " << vino << dendl;
2901    goto revoke;
2902  }
2903  in = inode_map[vino];
2904
2905  if (m->get_mask() & CEPH_LOCK_DN) {
2906    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
2907      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
2908      goto revoke;
2909    }
2910    Dentry *dn = in->dir->dentries[m->dname];
2911    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    // -1 marks "no MDS holds a lease on this dentry".
2912    dn->lease_mds = -1;
2913  }
2914
2915 revoke:
2916  m->get_connection()->send_message(
2917    new MClientLease(
2918      CEPH_MDS_LEASE_RELEASE, seq,
2919      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
2920  m->put();
2921}
2922
// Drop n references on an inode; on the last ref, release caps, verify the
// object cache is clean, remove it from the inode map, and delete it.
2923void Client::put_inode(Inode *in, int n)
2924{
2925  ldout(cct, 10) << "put_inode on " << *in << dendl;
2926  int left = in->_put(n);
2927  if (left == 0) {
2928    // release any caps
2929    remove_all_caps(in);
2930
2931    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    // All dirty data must have been flushed or purged before the last ref.
2932    bool unclean = objectcacher->release_set(&in->oset);
2933    assert(!unclean);
2934    inode_map.erase(in->vino());
2935    if (use_faked_inos())
2936      _release_faked_ino(in);
2937
    // Dropping the root also clears the cached ancestry used for quota/path
    // resolution.
2938    if (in == root) {
2939      root = 0;
2940      root_ancestor = 0;
2941      while (!root_parents.empty())
2942        root_parents.erase(root_parents.begin());
2943    }
2944
2945    delete in;
2946  }
2947}
2948
2949void Client::close_dir(Dir *dir)
2950{
2951 Inode *in = dir->parent_inode;
2952 ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
2953 assert(dir->is_empty());
2954 assert(in->dir == dir);
2955 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
2956 if (!in->dn_set.empty())
2957 in->get_first_parent()->put(); // unpin dentry
2958
2959 delete in->dir;
2960 in->dir = 0;
2961 put_inode(in); // unpin inode
2962}
2963
2964 /**
2965 * Don't call this with in==NULL, use get_or_create for that
2966 * leave dn set to default NULL unless you're trying to add
2967 * a new inode to a pre-created Dentry
2968 */
2969Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
2970{
  // Create the dentry if the caller did not supply one, then (optionally)
  // attach the inode to it. Directories keep a single parent dentry.
2971  if (!dn) {
2972    // create a new Dentry
2973    dn = new Dentry;
2974    dn->name = name;
2975
2976    // link to dir
2977    dn->dir = dir;
2978    dir->dentries[dn->name] = dn;
2979    lru.lru_insert_mid(dn); // mid or top?
2980
2981    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
2982		   << " dn " << dn << " (new dn)" << dendl;
2983  } else {
2984    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
2985		   << " dn " << dn << " (old dn)" << dendl;
2986  }
2987
2988  if (in) {    // link to inode
2989    dn->inode = in;
    // Directory inodes pin their parent dentry once per live Dir object
    // and once per outstanding low-level (ll_) reference.
2990    if (in->is_dir()) {
2991      if (in->dir)
2992	dn->get(); // dir -> dn pin
2993      if (in->ll_ref)
2994	dn->get(); // ll_ref -> dn pin
2995    }
2996
2997    assert(in->dn_set.count(dn) == 0);
2998
2999    // only one parent for directories!
3000    if (in->is_dir() && !in->dn_set.empty()) {
3001      Dentry *olddn = in->get_first_parent();
3002      assert(olddn->dir != dir || olddn->name != name);
      // The old parent dir's contents changed: bump its release counter and
      // drop its "complete/ordered" cache flags.
3003      Inode *old_diri = olddn->dir->parent_inode;
3004      old_diri->dir_release_count++;
3005      clear_dir_complete_and_ordered(old_diri, true);
3006      unlink(olddn, true, true);  // keep dir, dentry
3007    }
3008
3009    in->dn_set.insert(dn);
3010
3011    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dn_set << dendl;
3012  }
3013
3014  return dn;
3015}
3016
3017void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3018{
3019 InodeRef in;
3020 in.swap(dn->inode);
3021 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3022 << " inode " << dn->inode << dendl;
3023
3024 // unlink from inode
3025 if (in) {
3026 if (in->is_dir()) {
3027 if (in->dir)
3028 dn->put(); // dir -> dn pin
3029 if (in->ll_ref)
3030 dn->put(); // ll_ref -> dn pin
3031 }
3032 dn->inode = 0;
3033 assert(in->dn_set.count(dn));
3034 in->dn_set.erase(dn);
3035 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3036 }
3037
3038 if (keepdentry) {
3039 dn->lease_mds = -1;
3040 } else {
3041 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3042
3043 // unlink from dir
3044 dn->dir->dentries.erase(dn->name);
3045 if (dn->dir->is_empty() && !keepdir)
3046 close_dir(dn->dir);
3047 dn->dir = 0;
3048
3049 // delete den
3050 lru.lru_remove(dn);
3051 dn->put();
3052 }
3053}
3054
3055/**
3056 * For asynchronous flushes, check for errors from the IO and
3057 * update the inode if necessary
3058 */
// Completion for asynchronous flushes: records any I/O error on the inode
// so it can be reported to the application on the next fsync/close.
// Holds an InodeRef to keep the inode alive until the flush finishes.
3059class C_Client_FlushComplete : public Context {
3060private:
3061  Client *client;
3062  InodeRef inode;
3063public:
3064  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3065  void finish(int r) override {
    // Runs under client_lock (asserted); r is the flush result code.
3066    assert(client->client_lock.is_locked_by_me());
3067    if (r != 0) {
3068      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
3069      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3070        << " 0x" << std::hex << inode->ino << std::dec
3071        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3072      inode->set_async_err(r);
3073    }
3074  }
3075};
3076
3077
3078/****
3079 * caps
3080 */
3081
3082void Client::get_cap_ref(Inode *in, int cap)
3083{
3084 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3085 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3086 ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3087 in->get();
3088 }
3089 if ((cap & CEPH_CAP_FILE_CACHE) &&
3090 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3091 ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3092 in->get();
3093 }
3094 in->get_cap_ref(cap);
3095}
3096
// Release a reference on cap bits. When the last ref for a bit drops:
// finish pending cap_snaps (WR), wake writers/committers (BUFFER), give
// back no-longer-issued caps to the MDS, and unpin the inode.
3097void Client::put_cap_ref(Inode *in, int cap)
3098{
3099  int last = in->put_cap_ref(cap);
3100  if (last) {
3101    int put_nref = 0;
    // Bits we just released that the MDS no longer issues must be checked in.
3102    int drop = last & ~in->caps_issued();
3103    if (in->snapid == CEPH_NOSNAP) {
3104      if ((last & CEPH_CAP_FILE_WR) &&
3105	  !in->cap_snaps.empty() &&
3106	  in->cap_snaps.rbegin()->second.writing) {
3107	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
3108	in->cap_snaps.rbegin()->second.writing = 0;
3109	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3110	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
3111      }
3112      if (last & CEPH_CAP_FILE_BUFFER) {
3113	for (auto &p : in->cap_snaps)
3114	  p.second.dirty_data = 0;
3115	signal_cond_list(in->waitfor_commit);
3116	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
	// matches the inode pin taken in get_cap_ref for the first BUFFER ref
3117	++put_nref;
3118      }
3119    }
3120    if (last & CEPH_CAP_FILE_CACHE) {
3121      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
3122      ++put_nref;
3123    }
3124    if (drop)
3125      check_caps(in, 0);
3126    if (put_nref)
3127      put_inode(in, put_nref);
3128  }
3129}
3130
// Acquire cap references for 'need' (blocking until they are issued and not
// being revoked), opportunistically including 'want'.  For writes, 'endoff'
// is the end offset of the intended write, used for max_size negotiation.
// On success *phave holds the acquired caps and a cap ref is taken; returns
// 0, or -EBADF / -EROFS / error from pool-perm or cap renewal.
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    // The open-file modes must still cover what we need; otherwise the fh
    // backing this request was effectively invalidated.
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // Ask the MDS for a larger max_size before we block on it: only when
      // the write would exceed (or double past) the current limit and we
      // haven't already requested at least this much.
      if (endoff > 0 &&
	  (endoff >= (loff_t)in->max_size ||
	   endoff > (loff_t)(in->size << 1)) &&
	  endoff > (loff_t)in->wanted_max_size) {
	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	in->wanted_max_size = endoff;
	check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
	// A snapshot capture still writing must complete before new writes.
	if (in->cap_snaps.rbegin()->second.writing) {
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	// Any snap with dirty buffered data must be committed first; kick
	// off the flush ourselves and then wait on waitfor_commit below.
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	// Only take the caps if none of the 'want' bits are mid-revocation;
	// using a cap being revoked would stall the MDS.
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		 << " need " << ccap_string(need) << " want " << ccap_string(want)
		 << " revoking " << ccap_string(revoking)
		 << dendl;
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    // A read-only session can never grant write caps.
    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    // Caps were dropped (e.g. by session reset); re-request them from the
    // MDS instead of waiting forever, then retry the loop.
    if (in->flags & I_CAP_DROPPED) {
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if ((mds_wanted & file_wanted) ==
	  (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
	in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3225
3226int Client::get_caps_used(Inode *in)
3227{
3228 unsigned used = in->caps_used();
3229 if (!(used & CEPH_CAP_FILE_CACHE) &&
3230 !objectcacher->set_is_empty(&in->oset))
3231 used |= CEPH_CAP_FILE_CACHE;
3232 return used;
3233}
3234
3235void Client::cap_delay_requeue(Inode *in)
3236{
3237 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3238 in->hold_caps_until = ceph_clock_now();
3239 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3240 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3241}
3242
// Build and send a CEPH_CAP_OP_UPDATE message to the MDS for one cap:
// releases bits not in 'retain', acks revocations, and carries any dirty
// metadata being flushed ('flush' bits, tagged with 'flush_tid').
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      bool sync, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain bits the MDS is revoking
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
	   << " mds." << session->mds_num << " seq " << cap->seq
	   << (sync ? " sync " : " async ")
	   << " used " << ccap_string(used)
	   << " want " << ccap_string(want)
	   << " flush " << ccap_string(flush)
	   << " retain " << ccap_string(retain)
	   << " held "<< ccap_string(held)
	   << " revoking " << ccap_string(revoking)
	   << " dropping " << ccap_string(dropping)
	   << dendl;

  // Test-only fault injection: pretend to the MDS that we kept the caps it
  // is revoking (except xattr caps, see below), to exercise MDS recovery.
  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    // - tell the server we think issued is whatever they issued plus whatever we implemented
    // - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  // When flushing dirty metadata, tell the MDS which snap it follows.
  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
				   in->ino,
				   0,
				   cap->cap_id, cap->seq,
				   cap->implemented,
				   want,
				   flush,
				   cap->mseq,
                                   cap_epoch_barrier);
  // Credentials of whoever dirtied the caps, for MDS-side permission checks.
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // Xattrs ride along only when the xattr-exclusive dirty bit is flushed.
  if (flush & CEPH_CAP_XATTR_EXCL) {
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // Only the auth cap negotiates max_size with the MDS.
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3352
31f18b77
FG
3353static bool is_max_size_approaching(Inode *in)
3354{
3355 /* mds will adjust max size according to the reported size */
3356 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3357 return false;
3358 if (in->size >= in->max_size)
3359 return true;
3360 /* half of previous max_size increment has been used */
3361 if (in->max_size > in->reported_size &&
3362 (in->size << 1) >= in->max_size + in->reported_size)
3363 return true;
3364 return false;
3365}
7c673cae
FG
3366
3367/**
3368 * check_caps
3369 *
3370 * Examine currently used and wanted versus held caps. Release, flush or ack
3371 * revoked caps to the MDS as appropriate.
3372 *
3373 * @param in the inode to check
3374 * @param flags flags to apply to cap check
3375 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  if (in->is_dir() && (in->flags & I_COMPLETE)) {
    // we do this here because we don't want to drop to Fs (and then
    // drop the Fs if we do a create!) if that alone makes us send lookups
    // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
    wanted |= CEPH_CAP_FILE_EXCL;
  }

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  // retain = what we refuse to give back; be greedy while mounted.
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting) {
    if (wanted)
      retain |= CEPH_CAP_ANY;
    else
      retain |= CEPH_CAP_ANY_SHARED;
  }

  ldout(cct, 10) << "check_caps on " << *in
	   << " wanted " << ccap_string(wanted)
	   << " used " << ccap_string(used)
	   << " issued " << ccap_string(issued)
	   << " revoking " << ccap_string(revoking)
	   << " flags=" << flags
	   << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return; // guard if at end of func

  // Fc is being revoked and nothing is buffered: drop the clean page cache
  // now so the revocation can be acked below.
  if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
      (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) {
    if (_release(in))
      used &= ~CEPH_CAP_FILE_CACHE;
  }

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  if (flags & CHECK_CAPS_NODELAY)
    in->hold_caps_until = utime_t();
  else
    cap_delay_requeue(in);

  utime_t now = ceph_clock_now();

  // Iterate with a pre-advanced iterator: send_cap()/flush paths must not
  // invalidate the element we are about to visit.
  map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
  while (it != in->caps.end()) {
    mds_rank_t mds = it->first;
    Cap *cap = it->second;
    ++it;

    MetaSession *session = mds_sessions[mds];
    assert(session);

    // Caps covered by the auth MDS don't count as "used" against non-auth
    // MDSes, so their revocations can complete.
    cap_used = used;
    if (in->auth_cap && cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap->implemented & ~cap->issued;

    ldout(cct, 10) << " cap mds." << mds
	     << " issued " << ccap_string(cap->issued)
	     << " implemented " << ccap_string(cap->implemented)
	     << " revoking " << ccap_string(revoking) << dendl;

    // The remaining checks decide whether this cap needs a message ("ack")
    // or can be skipped this round.
    if (in->wanted_max_size > in->max_size &&
	in->wanted_max_size > in->requested_max_size &&
	cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap->issued & CEPH_CAP_FILE_WR) &&
	cap == in->auth_cap &&
	is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
		     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap->wanted | cap->issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if (wanted == cap->wanted &&         // mds knows what we want.
	((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
	!in->dirty_caps)                 // and we have no dirty caps
      continue;

    if (now < in->hold_caps_until) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
	session->mds_state < MDSMap::STATE_ACTIVE &&
	session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
		     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
	flush_snaps(in, true);
      if (in->flushing_caps)
	flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    // Dirty metadata piggybacks on the auth cap's update message.
    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
	     retain, flushing, flush_tid);
  }
}
3515
3516
// Capture the inode's current (pre-snapshot) state into a new CapSnap keyed
// by the old snap context's seq, so it can later be flushed to the MDS.
// No-op if nothing is dirty/writing, or if a capture is already pending.
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // the previous capture hasn't even finished writing yet; it will cover
    // this state too
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
	     (dirty & CEPH_CAP_ANY_WR)) {
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    // remember whether buffered file data still needs to be written back
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    // snapshot of the metadata the MDS cares about
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // a write is in flight; put_cap_ref() will finish this capture when
      // the last WR ref is dropped
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3556
// Complete a CapSnap capture: record the final size/time attributes, and
// either flush it to the MDS now or defer until buffered data is written.
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  // pick up anything dirtied since the capture was queued
  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // dirty buffered data outstanding; _flushed_cap_snap()/put_cap_ref()
    // will call flush_snaps() once it reaches the OSDs
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3582
3583void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3584{
3585 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3586 in->cap_snaps.at(seq).dirty_data = 0;
3587 flush_snaps(in);
3588}
3589
// Send FLUSHSNAP messages to the auth MDS for every completed CapSnap that
// hasn't been flushed yet (or for all of them when all_again is set, e.g.
// after MDS reconnect).  Snaps still writing or with dirty data are skipped.
void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  assert(in->cap_snaps.size());

  // pick auth mds
  assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
	continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    // not ready yet: data unwritten or capture incomplete
    if (capsnap.dirty_data || capsnap.writing)
      continue;

    if (capsnap.flush_tid == 0) {
      // register this flush with the session so completion can be tracked
      // (and replayed after reconnect)
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
	session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
				     cap_epoch_barrier);
    if (user_id >= 0)
      m->caller_uid = user_id;
    if (group_id >= 0)
      m->caller_gid = group_id;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    ::encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message(m);
  }
}
3666
3667
3668
3669void Client::wait_on_list(list<Cond*>& ls)
3670{
3671 Cond cond;
3672 ls.push_back(&cond);
3673 cond.Wait(client_lock);
3674 ls.remove(&cond);
3675}
3676
3677void Client::signal_cond_list(list<Cond*>& ls)
3678{
3679 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3680 (*it)->Signal();
3681}
3682
// Block (releasing client_lock) until someone drains this context list via
// signal_context_list().  The queued C_Cond is completed — and thereby
// freed — by that drain; completing it sets 'done' and signals 'cond'.
void Client::wait_on_context_list(list<Context*>& ls)
{
  Cond cond;
  bool done = false;
  int r;  // completion code written by C_Cond; intentionally ignored
  ls.push_back(new C_Cond(&cond, &done, &r));
  while (!done)
    cond.Wait(client_lock);
}
3692
3693void Client::signal_context_list(list<Context*>& ls)
3694{
3695 while (!ls.empty()) {
3696 ls.front()->complete(0);
3697 ls.pop_front();
3698 }
3699}
3700
3701void Client::wake_inode_waiters(MetaSession *s)
3702{
3703 xlist<Cap*>::iterator iter = s->caps.begin();
3704 while (!iter.end()){
3705 signal_cond_list((*iter)->inode->waitfor_caps);
3706 ++iter;
3707 }
3708}
3709
3710
3711// flush dirty data (from objectcache)
3712
3713class C_Client_CacheInvalidate : public Context {
3714private:
3715 Client *client;
3716 vinodeno_t ino;
3717 int64_t offset, length;
3718public:
3719 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3720 client(c), offset(off), length(len) {
3721 if (client->use_faked_inos())
3722 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3723 else
3724 ino = in->vino();
3725 }
3726 void finish(int r) override {
3727 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3728 assert(!client->client_lock.is_locked_by_me());
3729 client->_async_invalidate(ino, offset, length);
3730 }
3731};
3732
3733void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3734{
3735 if (unmounting)
3736 return;
3737 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3738 ino_invalidate_cb(callback_handle, ino, off, len);
3739}
3740
3741void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3742
3743 if (ino_invalidate_cb)
3744 // we queue the invalidate, which calls the callback and decrements the ref
3745 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3746}
3747
// Drop all cached data for this inode: release clean pages from the
// userspace object cacher, then schedule the kernel-page invalidation
// callback for the whole file.
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    // release_set() only drops clean data; anything left is dirty/in-flight
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  _schedule_invalidate_callback(in, 0, 0);
}
3761
// Range variant: discard cached data for [off, off+len) of this inode from
// the userspace object cacher and schedule the kernel invalidation callback
// for the same range.
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    // map the file byte range onto the object extents backing it
    vector<ObjectExtent> ls;
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
3775
3776bool Client::_release(Inode *in)
3777{
3778 ldout(cct, 20) << "_release " << *in << dendl;
3779 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3780 _invalidate_inode_cache(in);
3781 return true;
3782 }
3783 return false;
3784}
3785
3786bool Client::_flush(Inode *in, Context *onfinish)
3787{
3788 ldout(cct, 10) << "_flush " << *in << dendl;
3789
3790 if (!in->oset.dirty_or_tx) {
3791 ldout(cct, 10) << " nothing to flush" << dendl;
3792 onfinish->complete(0);
3793 return true;
3794 }
3795
3796 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3797 ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3798 objectcacher->purge_set(&in->oset);
3799 if (onfinish) {
3800 onfinish->complete(-ENOSPC);
3801 }
3802 return true;
3803 }
3804
3805 return objectcacher->flush_set(&in->oset, onfinish);
3806}
3807
// Synchronously flush a byte range of this inode's buffered data, blocking
// until it is safe on the OSDs.  Drops and re-takes client_lock while
// waiting, so caller state may change across this call.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // private lock/cond pair for the completion handshake with the flusher
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, onflush);
  if (!ret) {
    // wait for flush; release client_lock so the writeback/ack paths
    // (which need it) can make progress and fire onflush
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3832
3833void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3834{
3835 // Mutex::Locker l(client_lock);
3836 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3837 Inode *in = static_cast<Inode *>(oset->parent);
3838 assert(in);
3839 _flushed(in);
3840}
3841
// All buffered data for this inode has been written back; release the
// CACHE|BUFFER cap refs taken when the dirty data was created.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3848
3849
3850
3851// checks common to add_update_cap, handle_cap_grant
3852void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3853{
3854 unsigned had = in->caps_issued();
3855
3856 if ((issued & CEPH_CAP_FILE_CACHE) &&
3857 !(had & CEPH_CAP_FILE_CACHE))
3858 in->cache_gen++;
3859
3860 if ((issued & CEPH_CAP_FILE_SHARED) &&
3861 !(had & CEPH_CAP_FILE_SHARED)) {
3862 in->shared_gen++;
3863
3864 if (in->is_dir())
3865 clear_dir_complete_and_ordered(in, true);
3866 }
3867}
3868
// Install or update the cap this inode holds from mds_session's MDS, merging
// the newly issued bits, handling auth-cap migration, and waking any waiters
// that were blocked on the newly granted caps.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
			    int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // first cap from this MDS for this inode
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // first cap overall: attach the inode to its snap realm
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_sessions[mds];  // NOTE(review): no-op lookup? kept byte-identical — confirm intent
    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as the auth cap if it is newer (by mseq) than the
    // current one, migrating any in-flight flushes to the new session
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << "add_update_cap changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued |= issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->gen = mds_session->cap_gen;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
	continue;
      if (it->second->implemented & ~it->second->issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
3954
// Tear down one cap: optionally queue a release message to its MDS, detach
// it from the inode (clearing auth-cap/flushing state if it was the auth
// cap), and close the snap realm when this was the inode's last cap.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    // batched into the session's next cap-release message
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();  // unlink from the session's cap list
  delete cap;
  cap = nullptr;

  if (!in->is_any_caps()) {
    // last cap gone: the inode no longer pins its snap realm
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
3993
3994void Client::remove_all_caps(Inode *in)
3995{
3996 while (!in->caps.empty())
3997 remove_cap(in->caps.begin()->second, true);
3998}
3999
// Drop every cap held via this session (session reset/close).  Dirty or
// flushing state is abandoned — it cannot complete without the session —
// and waiters are woken so they can re-request caps (I_CAP_DROPPED).
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      // losing the auth cap: remember what state we are abandoning
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;  // get_caps() will re-request from the MDS
    }
    remove_cap(cap, false);  // no release message; the session is gone
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // keep the inode alive while its cap_snaps (which hold refs) are torn down
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in);  // drop the ref held by the flushing state
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();  // wake anyone waiting for flushes to complete
}
4035
b32b8144
FG
// Invoke the user-supplied remount callback (used to make the kernel trim
// its dentry cache).  Returns the callback's result; may abort the client
// if remounting fails and the relevant config options demand it.
int Client::_do_remount(void)
{
  // clear errno first so a -1 return can be attributed to the callback
  errno = 0;
  int r = remount_cb(callback_handle);
  if (r != 0) {
    int e = errno;  // capture before any call below can clobber it
    client_t whoami = get_nodeid();
    if (r == -1) {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "errno = " << e << " (" << strerror(e) << ")" << dendl;
    } else {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "return code = " << r << dendl;
    }
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_remount") ||
        cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort && !unmounting) {
      // stale kernel dentries are unsafe to keep serving; die loudly
      lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
      ceph_abort();
    }
  }
  return r;
}
4061
7c673cae
FG
4062class C_Client_Remount : public Context {
4063private:
4064 Client *client;
4065public:
4066 explicit C_Client_Remount(Client *c) : client(c) {}
4067 void finish(int r) override {
b32b8144
FG
4068 assert(r == 0);
4069 client->_do_remount();
7c673cae
FG
4070 }
4071};
4072
4073void Client::_invalidate_kernel_dcache()
4074{
4075 if (unmounting)
4076 return;
94b18763
FG
4077 if (can_invalidate_dentries) {
4078 if (dentry_invalidate_cb && root->dir) {
4079 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4080 p != root->dir->dentries.end();
4081 ++p) {
4082 if (p->second->inode)
4083 _schedule_invalidate_dentry_callback(p->second, false);
4084 }
7c673cae
FG
4085 }
4086 } else if (remount_cb) {
4087 // Hacky:
4088 // when remounting a file system, linux kernel trims all unused dentries in the fs
4089 remount_finisher.queue(new C_Client_Remount(this));
4090 }
4091}
4092
28e407b8 4093void Client::trim_caps(MetaSession *s, uint64_t max)
7c673cae
FG
4094{
4095 mds_rank_t mds = s->mds_num;
28e407b8 4096 size_t caps_size = s->caps.size();
7c673cae
FG
4097 ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
4098 << " caps " << caps_size << dendl;
4099
28e407b8
AA
4100 uint64_t trimmed = 0;
4101 auto p = s->caps.begin();
4102 std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
4103 * looking at from getting deleted during traversal. */
7c673cae
FG
4104 while ((caps_size - trimmed) > max && !p.end()) {
4105 Cap *cap = *p;
b32b8144 4106 InodeRef in(cap->inode);
7c673cae
FG
4107
4108 // Increment p early because it will be invalidated if cap
4109 // is deleted inside remove_cap
4110 ++p;
4111
4112 if (in->caps.size() > 1 && cap != in->auth_cap) {
4113 int mine = cap->issued | cap->implemented;
4114 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4115 // disposable non-auth cap
b32b8144 4116 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
7c673cae 4117 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
28e407b8 4118 cap = (remove_cap(cap, true), nullptr);
7c673cae
FG
4119 trimmed++;
4120 }
4121 } else {
4122 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4123 bool all = true;
4124 set<Dentry*>::iterator q = in->dn_set.begin();
7c673cae
FG
4125 while (q != in->dn_set.end()) {
4126 Dentry *dn = *q++;
4127 if (dn->lru_is_expireable()) {
4128 if (can_invalidate_dentries &&
4129 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4130 // Only issue one of these per DN for inodes in root: handle
4131 // others more efficiently by calling for root-child DNs at
4132 // the end of this function.
4133 _schedule_invalidate_dentry_callback(dn, true);
4134 }
28e407b8
AA
4135 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4136 to_trim.insert(dn);
7c673cae
FG
4137 } else {
4138 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4139 all = false;
4140 }
4141 }
4142 if (all && in->ino != MDS_INO_ROOT) {
4143 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4144 trimmed++;
4145 }
4146 }
4147 }
28e407b8
AA
4148 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4149 for (const auto &dn : to_trim) {
4150 trim_dentry(dn);
4151 }
4152 to_trim.clear();
7c673cae 4153
b32b8144
FG
4154 caps_size = s->caps.size();
4155 if (caps_size > max)
7c673cae
FG
4156 _invalidate_kernel_dcache();
4157}
4158
4159void Client::force_session_readonly(MetaSession *s)
4160{
4161 s->readonly = true;
4162 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4163 Inode *in = (*p)->inode;
4164 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4165 signal_cond_list(in->waitfor_caps);
4166 }
4167}
4168
7c673cae
FG
// Move an inode's dirty cap bits onto its "flushing" set and register a new
// flush tid with the auth session so the eventual FLUSH_ACK can be matched.
// Returns the cap bits being flushed and stores the new tid in *ptid.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  assert(flushing);

  // allocate a fresh, monotonically increasing tid for this flush
  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  // count the inode only on the 0 -> nonzero flushing transition
  if (!in->flushing_caps) {
    ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // dirty bits become flushing bits; the inode is no longer "dirty"
  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  // link into the session's flushing list (once) and record the tid
  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4196
4197void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4198{
4199 for (auto &p : in->cap_snaps) {
4200 CapSnap &capsnap = p.second;
4201 if (capsnap.flush_tid > 0) {
4202 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4203 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4204 }
4205 }
4206 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4207 it != in->flushing_cap_tids.end();
4208 ++it) {
4209 old_s->flushing_caps_tids.erase(it->first);
4210 new_s->flushing_caps_tids.insert(it->first);
4211 }
4212 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4213}
4214
4215/*
4216 * Flush all caps back to the MDS. Because the callers generally wait on the
4217 * result of this function (syncfs and umount cases), we set
4218 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4219 */
4220void Client::flush_caps_sync()
4221{
4222 ldout(cct, 10) << __func__ << dendl;
28e407b8 4223 xlist<Inode*>::iterator p = delayed_list.begin();
7c673cae
FG
4224 while (!p.end()) {
4225 unsigned flags = CHECK_CAPS_NODELAY;
4226 Inode *in = *p;
4227
4228 ++p;
28e407b8
AA
4229 delayed_list.pop_front();
4230 if (p.end() && dirty_list.empty())
7c673cae
FG
4231 flags |= CHECK_CAPS_SYNCHRONOUS;
4232 check_caps(in, flags);
4233 }
4234
4235 // other caps, too
28e407b8 4236 p = dirty_list.begin();
7c673cae
FG
4237 while (!p.end()) {
4238 unsigned flags = CHECK_CAPS_NODELAY;
4239 Inode *in = *p;
4240
4241 ++p;
4242 if (p.end())
4243 flags |= CHECK_CAPS_SYNCHRONOUS;
4244 check_caps(in, flags);
4245 }
4246}
4247
// (Re)send a cap flush message to the auth MDS for every outstanding flush
// tid on 'in'.  When 'sync' is set, only the final (highest-tid) message
// asks the MDS to flush its journal.
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  assert(cap->session == session);

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
	     (get_caps_used(in) | in->caps_dirty()),
	     in->caps_wanted(), (cap->issued | cap->implemented),
	     p->second, p->first);
  }
}
4269
// Block until all of 'in's cap flushes with tid <= 'want' have been acked
// (i.e. removed from flushing_cap_tids).  Waiters are woken from
// handle_cap_flush_ack via in->waitfor_caps.
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    assert(it != in->flushing_cap_tids.end());
    // oldest outstanding tid is already newer than what we need -- done
    if (it->first > want)
      break;
    ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4283
// Block until no session has an outstanding cap flush tid <= 'want'.
// Each wakeup (sync_cond, signalled from handle_cap_flush_ack) restarts the
// scan over all sessions, since any session's set may have changed.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << "wait_sync_caps want " << want  << " (last is " << last_flush_tid << ", "
	   << num_flushing_caps << " total flushing)" << dendl;
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    MetaSession *s = p->second;
    if (s->flushing_caps_tids.empty())
	continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      sync_cond.Wait(client_lock);
      goto retry;
    }
  }
}
4304
// After a session reconnect, re-send all outstanding cap/snap flushes on
// that session, skipping inodes already re-flushed by
// early_kick_flushing_caps (tracked in session->early_flushing_caps).
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (session->early_flushing_caps.count(in))
      continue;
    ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }

  // the early set has served its purpose for this reconnect
  session->early_flushing_caps.clear();
}
4323
// During client reconnect, re-send flushes whose flushing bits are no longer
// fully covered by the (possibly reduced) issued caps, and remember those
// inodes so kick_flushing_caps() does not flush them a second time.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    assert(in->auth_cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4350
4351void Client::kick_maxsize_requests(MetaSession *session)
4352{
4353 xlist<Cap*>::iterator iter = session->caps.begin();
4354 while (!iter.end()){
4355 (*iter)->inode->requested_max_size = 0;
4356 (*iter)->inode->wanted_max_size = 0;
4357 signal_cond_list((*iter)->inode->waitfor_caps);
4358 ++iter;
4359 }
4360}
4361
4362void SnapRealm::build_snap_context()
4363{
4364 set<snapid_t> snaps;
4365 snapid_t max_seq = seq;
4366
4367 // start with prior_parents?
4368 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4369 snaps.insert(prior_parent_snaps[i]);
4370
4371 // current parent's snaps
4372 if (pparent) {
4373 const SnapContext& psnapc = pparent->get_snap_context();
4374 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4375 if (psnapc.snaps[i] >= parent_since)
4376 snaps.insert(psnapc.snaps[i]);
4377 if (psnapc.seq > max_seq)
4378 max_seq = psnapc.seq;
4379 }
4380
4381 // my snaps
4382 for (unsigned i=0; i<my_snaps.size(); i++)
4383 snaps.insert(my_snaps[i]);
4384
4385 // ok!
4386 cached_snap_context.seq = max_seq;
4387 cached_snap_context.snaps.resize(0);
4388 cached_snap_context.snaps.reserve(snaps.size());
4389 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4390 cached_snap_context.snaps.push_back(*p);
4391}
4392
4393void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4394{
4395 list<SnapRealm*> q;
4396 q.push_back(realm);
4397
4398 while (!q.empty()) {
4399 realm = q.front();
4400 q.pop_front();
4401
4402 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4403 realm->invalidate_cache();
4404
4405 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4406 p != realm->pchildren.end();
4407 ++p)
4408 q.push_back(*p);
4409 }
4410}
4411
4412SnapRealm *Client::get_snap_realm(inodeno_t r)
4413{
4414 SnapRealm *realm = snap_realms[r];
4415 if (!realm)
4416 snap_realms[r] = realm = new SnapRealm(r);
4417 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4418 realm->nref++;
4419 return realm;
4420}
4421
4422SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4423{
4424 if (snap_realms.count(r) == 0) {
4425 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4426 return NULL;
4427 }
4428 SnapRealm *realm = snap_realms[r];
4429 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4430 realm->nref++;
4431 return realm;
4432}
4433
// Drop one reference on 'realm'.  On the last reference, unlink it from the
// realm table and from its parent (recursively dropping the parent ref we
// held) and delete it.
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);  // may cascade up the tree
    }
    delete realm;
  }
}
4447
4448bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4449{
4450 if (realm->parent != parent) {
4451 ldout(cct, 10) << "adjust_realm_parent " << *realm
4452 << " " << realm->parent << " -> " << parent << dendl;
4453 realm->parent = parent;
4454 if (realm->pparent) {
4455 realm->pparent->pchildren.erase(realm);
4456 put_snap_realm(realm->pparent);
4457 }
4458 realm->pparent = get_snap_realm(parent);
4459 realm->pparent->pchildren.insert(realm);
4460 return true;
4461 }
4462 return false;
4463}
4464
// True if new_snapc contains a snap newer than anything old_snapc had seen
// (snaps are stored newest-first, so element 0 is the newest).
static bool has_new_snaps(const SnapContext& old_snapc,
			  const SnapContext& new_snapc)
{
  return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
}
4470
4471
// Decode a snap trace (a sequence of SnapRealmInfo) from the MDS and apply
// it to our realm tree.  If 'flush' is set, dirty caps under any updated
// realm are queued for writeback against the OLD snap context before the
// realm is updated.  The first realm in the trace is returned via
// *realm_ret (with a reference held) if realm_ret is non-NULL.
void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;

  // realms whose snap context changed, with their pre-update contexts
  map<SnapRealm*, SnapContext> dirty_realms;

  bufferlist::iterator p = bl.begin();
  while (!p.end()) {
    SnapRealmInfo info;
    ::decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
	       << dendl;

      if (flush) {
	// writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
	// flush me + children
	list<SnapRealm*> q;
	q.push_back(realm);
	while (!q.empty()) {
	  SnapRealm *realm = q.front();
	  q.pop_front();

	  for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
	       p != realm->pchildren.end();
	       ++p)
	    q.push_back(*p);

	  if (dirty_realms.count(realm) == 0) {
	    realm->nref++;  // hold a ref until the flush loop below releases it
	    dirty_realms[realm] = realm->get_snap_context();
	  }
	}
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << "  snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
	       << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }

    // keep a ref on the first realm for the caller; drop the others
    if (!first_realm)
      first_realm = realm;
    else
      put_snap_realm(realm);
  }

  // queue cap snaps for inodes in realms that actually gained new snaps
  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
       q != dirty_realms.end();
       ++q) {
    SnapRealm *realm = q->first;
    // if there are new snaps ?
    if (has_new_snaps(q->second, realm->get_snap_context())) {
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
      while (!r.end()) {
	Inode *in = *r;
	++r;
	queue_cap_snap(in, q->second);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);
  }

  if (realm_ret)
    *realm_ret = first_realm;
  else
    put_snap_realm(first_realm);
}
4564
// Handle an MClientSnap message from an MDS.  For a SPLIT op, move the
// listed inodes (and child realms) into the newly split-off realm before
// applying the snap trace; queue cap snaps for moved inodes that gained
// new snaps.
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  // inodes to re-home into the split realm, with their old snap contexts
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
	 p != m->split_inos.end();
	 ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// don't regress an inode that is already in a newer realm
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
	 p != m->split_realms.end();
	 ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the snap trace (no pre-flush for DESTROY)
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // re-attach moved inodes to the split realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
4642
4643void Client::handle_quota(MClientQuota *m)
4644{
4645 mds_rank_t mds = mds_rank_t(m->get_source().num());
4646 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4647 if (!session) {
4648 m->put();
4649 return;
4650 }
4651
4652 got_mds_push(session);
4653
4654 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4655
4656 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4657 if (inode_map.count(vino)) {
4658 Inode *in = NULL;
4659 in = inode_map[vino];
4660
4661 if (in) {
4662 in->quota = m->quota;
4663 in->rstat = m->rstat;
4664 }
4665 }
4666
4667 m->put();
4668}
4669
// Top-level dispatcher for MClientCaps messages.  Applies OSD epoch
// barriers, resolves the target inode, and routes to the per-op handlers.
// Note the two-stage dispatch: IMPORT is handled first (which installs the
// cap) and then deliberately falls through to the GRANT path below.
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload();  // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  // ops that don't require us to already hold a cap from this mds
  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    handle_cap_import(session, in, m);  // installs the cap, then continues below
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  // ops that operate on an existing cap
  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();
  }
}
4743
// Handle CEPH_CAP_OP_IMPORT: an MDS has imported our cap from a peer MDS.
// Install/refresh the cap on this session, drop the stale peer cap, and if
// we just became auth, re-kick any pending snap/cap flushes.
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // remember the perms from the outgoing peer cap so the new cap keeps them
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap) {
      cap_perms = cap->latest_perms;
    }
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
		 CEPH_CAP_FLAG_AUTH, cap_perms);

  // retire the peer's copy of the cap (only if it is the one we expect)
  if (cap && cap->cap_id == m->peer.cap_id) {
      remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4784
// Handle CEPH_CAP_OP_EXPORT: the sending MDS is migrating our cap to a peer
// MDS.  Merge the cap state into the peer's cap (creating it if needed),
// transfer auth/flushing state where appropriate, then drop the local cap.
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  // only act if the exported cap matches the one we actually hold
  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
	Cap *tcap = in->caps[peer_mds];
	// merge into the existing peer cap if it matches and is older
	if (tcap->cap_id == m->peer.cap_id &&
	    ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
	  tcap->cap_id = m->peer.cap_id;
	  tcap->seq = m->peer.seq - 1;
	  tcap->issue_seq = tcap->seq;
	  tcap->mseq = m->peer.mseq;
	  tcap->issued |= cap->issued;
	  tcap->implemented |= cap->issued;
	  if (cap == in->auth_cap)
	    in->auth_cap = tcap;
	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
	    adjust_session_flushing_caps(in, session, tsession);
	}
      } else {
	// no cap on the peer yet: create one carrying our issued bits
	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
		       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
		       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
		       cap->latest_perms);
      }
    } else {
      // no peer: the cap is simply being dropped
      if (cap == in->auth_cap)
	in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
4832
// Handle CEPH_CAP_OP_TRUNC: the MDS changed the file size/truncation state;
// fold the new size, times, and inline data into the inode.
void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);

  ldout(cct, 10) << "handle_cap_trunc on ino " << *in
		 << " size " << in->size << " -> " << m->get_size()
		 << dendl;

  // compute the caps we effectively hold (issued | implemented | dirty) so
  // update_inode_file_bits knows which fields we are authoritative for
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(),
                         m->get_size(), m->get_change_attr(), m->get_time_warp_seq(),
                         m->get_ctime(), m->get_mtime(), m->get_atime(),
                         m->inline_version, m->inline_data, issued);
  m->put();
}
4851
// Handle CEPH_CAP_OP_FLUSH_ACK: the MDS has persisted a cap flush.  Retire
// the acked tid (and any older ones), wake waiters, and clear the acked
// bits from the inode's flushing set.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  // walk the pending flush tids: everything <= the acked tid is complete;
  // bits still owed by newer tids are removed from 'cleaned'
  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);  // erase-while-iterating idiom
      ++flushed;
      continue;
    }
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
	  << " cleaned " << ccap_string(cleaned) << " on " << *in
	  << " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake global syncers if this session has nothing older outstanding
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
	      << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // drop the ref held while the inode had dirty/flushing caps
      if (!in->caps_dirty())
	put_inode(in);
    }
  }

  m->put();
}
4911
4912
// Handle CEPH_CAP_OP_FLUSHSNAP_ACK: the MDS has persisted a capsnap flush.
// Retire the matching CapSnap (matched by follows + flush tid); duplicates
// or unknown follows are logged and ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);
  snapid_t follows = m->get_snap_follows();

  if (in->cap_snaps.count(follows)) {
    CapSnap &capsnap = in->cap_snaps.at(follows);
    if (m->get_client_tid() != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
	      << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
	tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
	in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(follows);
    }
  } else {
    ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
	    << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }

  m->put();
}
4942
// Context that invokes the client's dentry-invalidate callback from the
// async finisher thread.  Captures the dentry's identifying info (dir ino,
// target ino, name) at construction time since the Dentry may be freed
// before finish() runs.
class C_Client_DentryInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory (faked ino if use_faked_inos)
  vinodeno_t ino;     // target inode; zero ino when 'del' is false
  string name;        // dentry name, copied
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
      if (client->use_faked_inos()) {
	dirino.ino = dn->dir->parent_inode->faked_ino;
	if (del)
	  ino.ino = dn->inode->faked_ino;
      } else {
	dirino = dn->dir->parent_inode->vino();
	if (del)
	  ino = dn->inode->vino();
      }
      if (!del)
	ino.ino = inodeno_t();
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
4970
// Invoke the registered dentry invalidate callback (runs on the finisher
// thread, without client_lock).  No-op once the client is unmounting.
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}
4979
// Queue an async dentry invalidation, but only if a callback is registered
// and the kernel may actually know about this inode (ll_ref > 0).
void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
{
  if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
}
4985
// Try to shed references pinning 'in': drop expireable child dentries
// (recursing into snapshot subtrees), close an empty dir, trim an open
// snapdir, and optionally schedule kernel dcache invalidations for the
// remaining dentries.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() can invalidate the entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the dir released one ref on 'in'
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  // still pinned and visible to the kernel: invalidate its dentries there
  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      //        so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
5028
// Handle CEPH_CAP_OP_GRANT / REVOKE (and the post-IMPORT grant pass).
// Updates the cap's issued bits and the inode fields we are not
// authoritative for, flushes/releases cached data when caps are revoked,
// and decides whether a check_caps round-trip back to the MDS is needed.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const int old_caps = cap->issued;
  const int new_caps = m->get_caps();
  ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(old_caps) << dendl;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  in->layout = m->get_layout();

  // update inode
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;

  // only accept MDS values for fields we don't hold the EXCL cap on
  if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
    in->nlink = m->head.nlink;
    // nlink hit zero while the MDS is granting LINK caps: file was unlinked
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    bufferlist::iterator p = m->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
			 m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
			 m->get_mtime(), m->get_atime(),
			 m->inline_version, m->inline_data, issued);

  // max_size
  if (cap == in->auth_cap &&
      m->get_max_size() != in->max_size) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
    check = true;

  check_cap_issue(in, cap, new_caps);

  // update caps
  int revoked = old_caps & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << "  revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    // dirty buffers in use must be flushed before we can ack the revoke;
    // the flush completion will re-run check_caps for us
    if ((used & revoked & CEPH_CAP_FILE_BUFFER) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (revoked & CEPH_CAP_FILE_CACHE) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (old_caps == new_caps) {
    ldout(cct, 10) << "  caps unchanged at " << ccap_string(old_caps) << dendl;
  } else {
    ldout(cct, 10) << "  grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
	if (it->second == cap)
	  continue;
	if (it->second->implemented & ~it->second->issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);

  m->put();
}
5155
7c673cae
FG
// Check whether 'perms' may access 'in' with the requested MAY_* bits.
// Root (uid 0) is always allowed.  POSIX ACLs are consulted first for
// non-owners when group permission bits are present; -EAGAIN from the ACL
// check means "no ACL decision", so we fall back to classic mode bits.
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0)
    return 0;

  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -EACCES;
  return 0;
}
5172
5173int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5174 const UserPerm& perms)
5175{
5176 int r = _getattr_for_perm(in, perms);
5177 if (r < 0)
5178 goto out;
5179
5180 r = 0;
5181 if (strncmp(name, "system.", 7) == 0) {
5182 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5183 r = -EPERM;
5184 } else {
5185 r = inode_permission(in, perms, want);
5186 }
5187out:
5188 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5189 return r;
5190}
5191
5192ostream& operator<<(ostream &out, const UserPerm& perm) {
5193 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5194 return out;
5195}
5196
// Check whether 'perms' is allowed to apply the setattr described by
// stx/mask to 'in'.  chown/chgrp/chmod follow owner-or-root rules;
// truncation needs write permission; explicit timestamp sets are
// owner-or-root while the "*_NOW" variants only need write permission.
// Returns 0 on success or a negative errno (-EPERM/-EACCES/...).
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  // make sure mode/owner (and ACLs if enabled) are fresh before deciding
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncate/extend requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; a non-root owner may only "chown"
    // to the current owner (i.e. a no-op)
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // non-root: must own the file and either keep the gid or move it to
    // one of the caller's supplementary groups
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    // chmod is owner-or-root
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-root callers not in the file's (new) group can't set setgid;
    // silently strip it, like the kernel does
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owner: setting explicit times (or ctime/btime at all) is
      // forbidden; setting times to "now" only needs write permission
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5253
// Permission check for opening 'in' with open(2)-style 'flags'.
// Rejects opening symlinks (-ELOOP) and write-opens of directories
// (-EISDIR), then checks the MAY_READ/MAY_WRITE bits derived from the
// access mode (O_TRUNC implies write).  Returns 0 or a negative errno.
int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  // map O_ACCMODE (and O_TRUNC) onto MAY_* permission bits
  unsigned want = 0;

  if ((flags & O_ACCMODE) == O_WRONLY)
    want = MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDWR)
    want = MAY_READ | MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDONLY)
    want = MAY_READ;
  if (flags & O_TRUNC)
    want |= MAY_WRITE;

  int r = 0;
  switch (in->mode & S_IFMT) {
    case S_IFLNK:
      // symlinks themselves are never opened
      r = -ELOOP;
      goto out;
    case S_IFDIR:
      if (want & MAY_WRITE) {
	r = -EISDIR;
	goto out;
      }
      break;
  }

  // refresh mode/ACL state from the MDS if needed
  r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = inode_permission(in, perms, want);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5290
5291int Client::may_lookup(Inode *dir, const UserPerm& perms)
5292{
181888fb 5293 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5294 int r = _getattr_for_perm(dir, perms);
5295 if (r < 0)
5296 goto out;
5297
5298 r = inode_permission(dir, perms, MAY_EXEC);
5299out:
5300 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5301 return r;
5302}
5303
5304int Client::may_create(Inode *dir, const UserPerm& perms)
5305{
181888fb 5306 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5307 int r = _getattr_for_perm(dir, perms);
5308 if (r < 0)
5309 goto out;
5310
5311 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5312out:
5313 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5314 return r;
5315}
5316
5317int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5318{
181888fb 5319 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5320 int r = _getattr_for_perm(dir, perms);
5321 if (r < 0)
5322 goto out;
5323
5324 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5325 if (r < 0)
5326 goto out;
5327
5328 /* 'name == NULL' means rmsnap */
5329 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5330 InodeRef otherin;
5331 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5332 if (r < 0)
5333 goto out;
5334 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5335 r = -EPERM;
5336 }
5337out:
5338 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5339 return r;
5340}
5341
// Permission check for hard-linking 'in'.  Owner and root may always
// link.  For others this mirrors Linux's protected_hardlinks policy:
// the target must be a regular file, not setuid, not setgid+group-exec,
// and the caller needs read+write permission on it.
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // owner or root: always allowed
  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  r = -EPERM;
  if (!S_ISREG(in->mode))
    goto out;

  if (in->mode & S_ISUID)
    goto out;

  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5369
5370int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5371{
5372 int mask = CEPH_STAT_CAP_MODE;
5373 bool force = false;
5374 if (acl_type != NO_ACL) {
5375 mask |= CEPH_STAT_CAP_XATTR;
5376 force = in->xattr_version == 0;
5377 }
5378 return _getattr(in, mask, perms, force);
5379}
5380
5381vinodeno_t Client::_get_vino(Inode *in)
5382{
5383 /* The caller must hold the client lock */
5384 return vinodeno_t(in->ino, in->snapid);
5385}
5386
5387inodeno_t Client::_get_inodeno(Inode *in)
5388{
5389 /* The caller must hold the client lock */
5390 return in->ino;
5391}
5392
5393
5394/**
5395 * Resolve an MDS spec to a list of MDS daemon GIDs.
5396 *
5397 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5398 * It may be '*' in which case it matches all GIDs.
5399 *
5400 * If no error is returned, the `targets` vector will be populated with at least
5401 * one MDS.
5402 */
5403int Client::resolve_mds(
5404 const std::string &mds_spec,
5405 std::vector<mds_gid_t> *targets)
5406{
5407 assert(fsmap);
5408 assert(targets != nullptr);
5409
5410 mds_role_t role;
5411 std::stringstream ss;
5412 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5413 if (role_r == 0) {
5414 // We got a role, resolve it to a GID
5415 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5416 << role << "'" << dendl;
5417 targets->push_back(
5418 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5419 return 0;
5420 }
5421
5422 std::string strtol_err;
5423 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5424 if (strtol_err.empty()) {
5425 // It is a possible GID
5426 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5427 if (fsmap->gid_exists(mds_gid)) {
5428 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5429 targets->push_back(mds_gid);
5430 } else {
5431 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5432 << dendl;
5433 return -ENOENT;
5434 }
5435 } else if (mds_spec == "*") {
5436 // It is a wildcard: use all MDSs
5437 const auto mds_info = fsmap->get_mds_info();
5438
5439 if (mds_info.empty()) {
5440 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5441 return -ENOENT;
5442 }
5443
5444 for (const auto i : mds_info) {
5445 targets->push_back(i.first);
5446 }
5447 } else {
5448 // It did not parse as an integer, it is not a wildcard, it must be a name
5449 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5450 if (mds_gid == 0) {
5451 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5452
5453 lderr(cct) << "FSMap: " << *fsmap << dendl;
5454
5455 return -ENOENT;
5456 } else {
5457 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5458 << "' to GID " << mds_gid << dendl;
5459 targets->push_back(mds_gid);
5460 }
5461 }
5462
5463 return 0;
5464}
5465
5466
/**
 * Authenticate with mon and establish global ID
 *
 * Caller must hold client_lock; the lock is dropped while blocking on
 * the monitor handshake and re-taken afterwards.  On success our global
 * id becomes our client entity name.  Returns 0 or a negative errno.
 */
int Client::authenticate()
{
  assert(client_lock.is_locked_by_me());

  if (monclient->is_authenticated()) {
    return 0;
  }

  // the mon handshake blocks on network I/O; don't hold client_lock
  // across it or we stall every other client thread
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5490
// Fetch an up-to-date FSMap (or FSMapUser when 'user' is true) from the
// monitors.  Blocks — dropping client_lock while waiting — until our
// cached map's epoch is at least the newest the monitor reports.
// Returns 0 or a negative errno.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // don't hold the client lock across the monitor round trip
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe and wait until fsmap_user catches up to fsmap_latest;
    // the map-handler wakes waiting_for_fsmap when it arrives
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full fsmap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5534
5535/**
5536 *
5537 * @mds_spec one of ID, rank, GID, "*"
5538 *
5539 */
5540int Client::mds_command(
5541 const std::string &mds_spec,
5542 const vector<string>& cmd,
5543 const bufferlist& inbl,
5544 bufferlist *outbl,
5545 string *outs,
5546 Context *onfinish)
5547{
5548 Mutex::Locker lock(client_lock);
5549
181888fb
FG
5550 if (!initialized)
5551 return -ENOTCONN;
7c673cae
FG
5552
5553 int r;
5554 r = authenticate();
5555 if (r < 0) {
5556 return r;
5557 }
5558
5559 r = fetch_fsmap(false);
5560 if (r < 0) {
5561 return r;
5562 }
5563
5564 // Look up MDS target(s) of the command
5565 std::vector<mds_gid_t> targets;
5566 r = resolve_mds(mds_spec, &targets);
5567 if (r < 0) {
5568 return r;
5569 }
5570
5571 // If daemons are laggy, we won't send them commands. If all
5572 // are laggy then we fail.
5573 std::vector<mds_gid_t> non_laggy;
5574 for (const auto gid : targets) {
5575 const auto info = fsmap->get_info_gid(gid);
5576 if (!info.laggy()) {
5577 non_laggy.push_back(gid);
5578 }
5579 }
5580 if (non_laggy.size() == 0) {
5581 *outs = "All targeted MDS daemons are laggy";
5582 return -ENOENT;
5583 }
5584
5585 if (metadata.empty()) {
5586 // We are called on an unmounted client, so metadata
5587 // won't be initialized yet.
5588 populate_metadata("");
5589 }
5590
5591 // Send commands to targets
5592 C_GatherBuilder gather(cct, onfinish);
5593 for (const auto target_gid : non_laggy) {
5594 const auto info = fsmap->get_info_gid(target_gid);
5595
5596 // Open a connection to the target MDS
5597 entity_inst_t inst = info.get_inst();
5598 ConnectionRef conn = messenger->get_connection(inst);
5599
5600 // Generate MDSCommandOp state
5601 auto &op = command_table.start_command();
5602
5603 op.on_finish = gather.new_sub();
5604 op.cmd = cmd;
5605 op.outbl = outbl;
5606 op.outs = outs;
5607 op.inbl = inbl;
5608 op.mds_gid = target_gid;
5609 op.con = conn;
5610
5611 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5612 << " tid=" << op.tid << cmd << dendl;
5613
5614 // Construct and send MCommand
5615 MCommand *m = op.get_message(monclient->get_fsid());
5616 conn->send_message(m);
5617 }
5618 gather.activate();
5619
5620 return 0;
5621}
5622
5623void Client::handle_command_reply(MCommandReply *m)
5624{
5625 ceph_tid_t const tid = m->get_tid();
5626
5627 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5628
5629 if (!command_table.exists(tid)) {
5630 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5631 m->put();
5632 return;
5633 }
5634
5635 auto &op = command_table.get_command(tid);
5636 if (op.outbl) {
5637 op.outbl->claim(m->get_data());
5638 }
5639 if (op.outs) {
5640 *op.outs = m->rs;
5641 }
5642
5643 if (op.on_finish) {
5644 op.on_finish->complete(m->r);
5645 }
5646
5647 command_table.erase(tid);
5648
5649 m->put();
5650}
5651
5652// -------------------
5653// MOUNT
5654
// Mount the filesystem: authenticate, subscribe to the mdsmap (optionally
// scoped to a named filesystem via client_mds_namespace), optionally wait
// for an available MDS cluster, then resolve and pin the mount root.
// Returns 0, a negative errno, or CEPH_FUSE_NO_MDS_UP when require_mds is
// set and the cluster is stuck unavailable.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  // allow a fresh mount after a prior unmount cycle
  unmounting = false;

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    // a specific filesystem was requested: translate its name to a
    // cluster id and subscribe to that filesystem's map ("mdsmap.<cid>")
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    // block until the MDS cluster is usable (or known to be stuck)
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // getattr the mount point, then walk up toward "/" so ancestors get
  // cached too; tolerate EACCES on an ancestor (quotas may not work)
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  // pin the root inode for the lifetime of the mount
  _ll_get(root);

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5767
5768// UNMOUNT
5769
5770void Client::_close_sessions()
5771{
5772 while (!mds_sessions.empty()) {
5773 // send session closes!
5774 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5775 p != mds_sessions.end();
5776 ++p) {
5777 if (p->second->state != MetaSession::STATE_CLOSING) {
5778 _close_mds_session(p->second);
5779 }
5780 }
5781
5782 // wait for sessions to close
5783 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5784 mount_cond.Wait(client_lock);
5785 }
5786}
5787
31f18b77
FG
5788void Client::flush_mdlog_sync()
5789{
5790 if (mds_requests.empty())
5791 return;
5792 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5793 p != mds_sessions.end();
5794 ++p) {
5795 MetaSession *s = p->second;
5796 flush_mdlog(s);
5797 }
5798}
5799
5800void Client::flush_mdlog(MetaSession *session)
5801{
5802 // Only send this to Luminous or newer MDS daemons, older daemons
5803 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5804 const uint64_t features = session->con->get_features();
5805 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5806 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5807 session->con->send_message(m);
5808 }
5809}
5810
5811
// Tear down the mount: drain in-flight MDS requests, close leaked
// files/dirs, flush dirty data and caps, empty the inode cache, and
// close all MDS sessions.  Caller holds client_lock.  Idempotent:
// returns immediately if an unmount is already in progress.  When
// blacklisted, skips the clean flush path entirely.
void Client::_unmount()
{
  if (unmounting)
    return;

  ldout(cct, 2) << "unmounting" << dendl;
  unmounting = true;

  deleg_timeout = 0;

  flush_mdlog_sync(); // flush the mdlog for pending requests, if any
  // wait for outstanding requests to drain; replies signal mount_cond
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // stop the periodic tick
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ... and unclosed low-level (ll_) handles
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // ... and unclosed directory handles
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (blacklisted) {
    ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;

    if (cct->_conf->client_oc) {
      // Purge all cached data so that ObjectCacher doesn't get hung up
      // trying to flush it.  ObjectCacher's behaviour on EBLACKLISTED
      // is to just leave things marked dirty
      // (http://tracker.ceph.com/issues/9105)
      for (const auto &i : inode_map) {
	objectcacher->purge_set(&(i.second->oset));
      }
    }

    mounted = false;
    return;
  }

  // wait for unacknowledged sync writes to be acked by the OSDs
  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    // (save 'next' up front: release/flush may invalidate this entry)
    ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
    for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
	 p != inode_map.end();
	 p = next) {
      next = p;
      ++next;
      Inode *in = p->second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
	assert(in);
      }
      if (!in->caps.empty()) {
	// keep the inode alive across release/flush
	InodeRef tmp_ref(in);
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  flush_caps_sync();
  wait_sync_caps(last_flush_tid);

  // empty lru cache
  trim_cache();

  // wait for remaining caps/inodes to be released; dump the cache every
  // 5 seconds while stuck, to aid debugging
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
	    << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
	    << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  assert(lru.lru_get_size() == 0);
  assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
5934
b32b8144
FG
// Public entry point: take the client lock and run the real unmount.
void Client::unmount()
{
  Mutex::Locker lock(client_lock);
  _unmount();
}
5940
7c673cae
FG
5941void Client::flush_cap_releases()
5942{
5943 // send any cap releases
5944 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5945 p != mds_sessions.end();
5946 ++p) {
5947 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
5948 p->first)) {
5949 if (cct->_conf->client_inject_release_failure) {
5950 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
5951 p->second->release->put();
5952 } else {
5953 p->second->con->send_message(p->second->release);
5954 }
5955 p->second->release = 0;
5956 }
5957 }
5958}
5959
// Periodic housekeeping.  Runs with client_lock held (the timer callback
// asserts this) and re-arms itself every client_tick_interval seconds.
void Client::tick()
{
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    // testing hook: stall this tick, then reset the knob so it fires once
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  // re-arm before doing any work so a throw/return can't kill the tick
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	assert(client_lock.is_locked_by_me());
	tick();
      }));

  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    // mount still in progress: time out the oldest pending request once
    // client_mount_timeout has elapsed, and wake everyone waiting on it
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: check inodes whose hold time has expired; the break on
  // the first not-yet-due inode relies on the list being in hold-time order
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6016
6017void Client::renew_caps()
6018{
6019 ldout(cct, 10) << "renew_caps()" << dendl;
6020 last_cap_renew = ceph_clock_now();
6021
6022 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6023 p != mds_sessions.end();
6024 ++p) {
6025 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6026 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6027 renew_caps(p->second);
6028 }
6029}
6030
6031void Client::renew_caps(MetaSession *session)
6032{
6033 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6034 session->last_cap_renew_request = ceph_clock_now();
6035 uint64_t seq = ++session->cap_renew_seq;
6036 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6037}
6038
6039
6040// ===============================================================
6041// high level (POSIXy) interface
6042
6043int Client::_do_lookup(Inode *dir, const string& name, int mask,
6044 InodeRef *target, const UserPerm& perms)
6045{
6046 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6047 MetaRequest *req = new MetaRequest(op);
6048 filepath path;
6049 dir->make_nosnap_relative_path(path);
6050 path.push_dentry(name);
6051 req->set_filepath(path);
6052 req->set_inode(dir);
6053 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6054 mask |= DEBUG_GETATTR_CAPS;
6055 req->head.args.getattr.mask = mask;
6056
6057 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6058
6059 int r = make_request(req, perms, target);
6060 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6061 return r;
6062}
6063
// Resolve one path component 'dname' in directory 'dir'.  Handles ".",
// ".." and the snapdir specially, then tries to satisfy the lookup from
// the local dcache (valid dentry lease, or dir FILE_SHARED cap — with
// I_COMPLETE allowing a local ENOENT) before falling back to an MDS
// round trip via _do_lookup().  'mask' is the cap set the caller needs
// issued on the result for the cached answer to count.  On success
// *target is set; returns 0 or a negative errno.
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname == "..") {
    if (dir->dn_set.empty())
      *target = dir;
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // the magic snapshot directory (".snap" by default)
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	     << " seq " << dn->lease_seq
	     << dendl;

    // the cached dentry only helps if its inode (when present) carries
    // the caps the caller asked for
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	// the lease is only valid within the session's cap generation
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	// null dentry + complete dir contents => name really doesn't exist
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss (or stale): ask the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  // keep the dentry warm in the LRU
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6165
// Find the dentry 'name' in 'dir' (creating a null dentry if absent) and
// return it in *pdn.  When 'expect_null' is set and an existing dentry
// with a valid lease already points at an inode, fail with -EEXIST
// (used by exclusive-create paths).  Returns 0 otherwise.
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession *s = mds_sessions[dn->lease_mds];
      // lease is only trustworthy within the session's cap generation
      if (s->cap_ttl > now &&
	  s->cap_gen == dn->lease_gen) {
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one (a null dentry with no inode attached)
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6197
// Walk 'origpath' component by component from root (absolute) or cwd
// (relative), resolving symlinks along the way; a trailing symlink is
// only followed when 'followsym' is set.  'mask' asks for extra caps on
// the final inode; per-component may_lookup checks run when
// client_permissions is enabled.  On success *end receives the final
// inode.  Returns 0, or a negative errno (-ELOOP after MAXSYMLINKS
// symlink hops, -ENOENT if the walk dead-ends).
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  // total symlinks traversed, to bound resolution depth
  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  // absolute target: restart the walk from root
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6279
6280
6281// namespace ops
6282
// Create a hard link at 'relpath' pointing to the inode at 'relexisting'.
// With client_permissions enabled, rejects hard links to directories and
// enforces may_hardlink() on the target plus create permission on the
// destination directory.  Returns 0 or a negative errno.
int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "link" << std::endl;
  tout(cct) << relexisting << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath existing(relexisting);

  InodeRef in, dir;
  int r = path_walk(existing, &in, perm, true);
  if (r < 0)
    return r;
  // can't create a link named "/"
  if (std::string(relpath) == "/") {
    r = -EEXIST;
    return r;
  }
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();

  r = path_walk(path, &dir, perm, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    if (S_ISDIR(in->mode)) {
      // directories can't be hard-linked
      r = -EPERM;
      return r;
    }
    r = may_hardlink(in.get(), perm);
    if (r < 0)
      return r;
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  r = _link(in.get(), dir.get(), name.c_str(), perm);
  return r;
}
6325
6326int Client::unlink(const char *relpath, const UserPerm& perm)
6327{
6328 Mutex::Locker lock(client_lock);
6329 tout(cct) << "unlink" << std::endl;
6330 tout(cct) << relpath << std::endl;
6331
181888fb
FG
6332 if (unmounting)
6333 return -ENOTCONN;
6334
7c673cae
FG
6335 if (std::string(relpath) == "/")
6336 return -EISDIR;
6337
6338 filepath path(relpath);
6339 string name = path.last_dentry();
6340 path.pop_dentry();
6341 InodeRef dir;
6342 int r = path_walk(path, &dir, perm);
6343 if (r < 0)
6344 return r;
6345 if (cct->_conf->client_permissions) {
6346 r = may_delete(dir.get(), name.c_str(), perm);
6347 if (r < 0)
6348 return r;
6349 }
6350 return _unlink(dir.get(), name.c_str(), perm);
6351}
6352
6353int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6354{
6355 Mutex::Locker lock(client_lock);
6356 tout(cct) << "rename" << std::endl;
6357 tout(cct) << relfrom << std::endl;
6358 tout(cct) << relto << std::endl;
6359
181888fb
FG
6360 if (unmounting)
6361 return -ENOTCONN;
6362
7c673cae
FG
6363 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6364 return -EBUSY;
6365
6366 filepath from(relfrom);
6367 filepath to(relto);
6368 string fromname = from.last_dentry();
6369 from.pop_dentry();
6370 string toname = to.last_dentry();
6371 to.pop_dentry();
6372
6373 InodeRef fromdir, todir;
6374 int r = path_walk(from, &fromdir, perm);
6375 if (r < 0)
6376 goto out;
6377 r = path_walk(to, &todir, perm);
6378 if (r < 0)
6379 goto out;
6380
6381 if (cct->_conf->client_permissions) {
6382 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6383 if (r < 0)
6384 return r;
6385 r = may_delete(todir.get(), toname.c_str(), perm);
6386 if (r < 0 && r != -ENOENT)
6387 return r;
6388 }
6389 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6390out:
6391 return r;
6392}
6393
6394// dirs
6395
6396int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6397{
6398 Mutex::Locker lock(client_lock);
6399 tout(cct) << "mkdir" << std::endl;
6400 tout(cct) << relpath << std::endl;
6401 tout(cct) << mode << std::endl;
6402 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6403
181888fb
FG
6404 if (unmounting)
6405 return -ENOTCONN;
6406
7c673cae
FG
6407 if (std::string(relpath) == "/")
6408 return -EEXIST;
6409
6410 filepath path(relpath);
6411 string name = path.last_dentry();
6412 path.pop_dentry();
6413 InodeRef dir;
6414 int r = path_walk(path, &dir, perm);
6415 if (r < 0)
6416 return r;
6417 if (cct->_conf->client_permissions) {
6418 r = may_create(dir.get(), perm);
6419 if (r < 0)
6420 return r;
6421 }
6422 return _mkdir(dir.get(), name.c_str(), mode, perm);
6423}
6424
// Create every missing component of `relpath` (like `mkdir -p`), relative
// to the cwd.  Returns 0, -EEXIST if the full path already exists, or a
// negative errno from lookup/creation.
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  // Phase 1: walk component by component until a lookup fails; `i` ends
  // up indexing the first missing component and `r` holds the last error.
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;
  // Only "component not found" means we should create it; any other
  // lookup failure (e.g. -EACCES) is propagated as-is.
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  // Phase 2: create each remaining component, descending as we go.
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // A racing client may have created an intermediate component; treat
    // -EEXIST on a non-final component as success and look it up instead.
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6481
6482int Client::rmdir(const char *relpath, const UserPerm& perms)
6483{
6484 Mutex::Locker lock(client_lock);
6485 tout(cct) << "rmdir" << std::endl;
6486 tout(cct) << relpath << std::endl;
6487
181888fb
FG
6488 if (unmounting)
6489 return -ENOTCONN;
6490
7c673cae
FG
6491 if (std::string(relpath) == "/")
6492 return -EBUSY;
6493
6494 filepath path(relpath);
6495 string name = path.last_dentry();
6496 path.pop_dentry();
6497 InodeRef dir;
6498 int r = path_walk(path, &dir, perms);
6499 if (r < 0)
6500 return r;
6501 if (cct->_conf->client_permissions) {
6502 int r = may_delete(dir.get(), name.c_str(), perms);
6503 if (r < 0)
6504 return r;
6505 }
6506 return _rmdir(dir.get(), name.c_str(), perms);
6507}
6508
6509int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6510{
6511 Mutex::Locker lock(client_lock);
6512 tout(cct) << "mknod" << std::endl;
6513 tout(cct) << relpath << std::endl;
6514 tout(cct) << mode << std::endl;
6515 tout(cct) << rdev << std::endl;
6516
181888fb
FG
6517 if (unmounting)
6518 return -ENOTCONN;
6519
7c673cae
FG
6520 if (std::string(relpath) == "/")
6521 return -EEXIST;
6522
6523 filepath path(relpath);
6524 string name = path.last_dentry();
6525 path.pop_dentry();
6526 InodeRef dir;
6527 int r = path_walk(path, &dir, perms);
6528 if (r < 0)
6529 return r;
6530 if (cct->_conf->client_permissions) {
6531 int r = may_create(dir.get(), perms);
6532 if (r < 0)
6533 return r;
6534 }
6535 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6536}
6537
6538// symlinks
6539
6540int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6541{
6542 Mutex::Locker lock(client_lock);
6543 tout(cct) << "symlink" << std::endl;
6544 tout(cct) << target << std::endl;
6545 tout(cct) << relpath << std::endl;
6546
181888fb
FG
6547 if (unmounting)
6548 return -ENOTCONN;
6549
7c673cae
FG
6550 if (std::string(relpath) == "/")
6551 return -EEXIST;
6552
6553 filepath path(relpath);
6554 string name = path.last_dentry();
6555 path.pop_dentry();
6556 InodeRef dir;
6557 int r = path_walk(path, &dir, perms);
6558 if (r < 0)
6559 return r;
6560 if (cct->_conf->client_permissions) {
6561 int r = may_create(dir.get(), perms);
6562 if (r < 0)
6563 return r;
6564 }
6565 return _symlink(dir.get(), name.c_str(), target, perms);
6566}
6567
6568int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6569{
6570 Mutex::Locker lock(client_lock);
6571 tout(cct) << "readlink" << std::endl;
6572 tout(cct) << relpath << std::endl;
6573
181888fb
FG
6574 if (unmounting)
6575 return -ENOTCONN;
6576
7c673cae
FG
6577 filepath path(relpath);
6578 InodeRef in;
6579 int r = path_walk(path, &in, perms, false);
6580 if (r < 0)
6581 return r;
6582
6583 return _readlink(in.get(), buf, size);
6584}
6585
6586int Client::_readlink(Inode *in, char *buf, size_t size)
6587{
6588 if (!in->is_symlink())
6589 return -EINVAL;
6590
6591 // copy into buf (at most size bytes)
6592 int r = in->symlink.length();
6593 if (r > (int)size)
6594 r = size;
6595 memcpy(buf, in->symlink.c_str(), r);
6596 return r;
6597}
6598
6599
6600// inode stuff
6601
// Refresh the attributes of `in` covered by cap mask `mask`.  If the
// needed caps are already issued the cached values are authoritative and
// no MDS round trip happens (unless `force` is set).
int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
{
  bool yes = in->caps_issued_mask(mask, true);

  ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
  if (yes && !force)
    return 0;

  // Issue a synchronous GETATTR to the MDS for the requested mask.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->head.args.getattr.mask = mask;

  int res = make_request(req, perms);
  ldout(cct, 10) << "_getattr result=" << res << dendl;
  return res;
}
6621
// Apply the attribute changes selected by `mask` from `stx` to `in`.
// Changes covered by exclusive caps we hold are applied locally and the
// caps marked dirty; anything left over is sent to the MDS as a SETATTR
// request.  Returns 0 or a negative errno.
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // Snapshots are immutable.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // Growing the file must not exceed the enclosing quota's byte limit.
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // Dirty whichever exclusive cap we hold so the ctime bump gets
    // written back; with no exclusive cap, fall through to a sync request.
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // With exclusive auth caps, ownership/mode changes can be applied
    // locally; each handled bit is cleared from `mask`.
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    // Exclusive file caps also let the timestamps be updated locally.
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
	in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
	in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask) {
    // Everything was satisfied locally — no MDS round trip needed.
    in->change_attr++;
    return 0;
  }

force_request:
  // Whatever remains in `mask` must be applied synchronously by the MDS.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    // Reject truncate targets beyond the cluster's configured maximum.
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6812
/* Note that we only care about attrs that setattr cares about */
// Translate a struct stat into a ceph_statx.  NOTE: `stx` is not zeroed
// here — only the setattr-relevant fields are populated; callers pair
// this with a mask that selects exactly those fields.
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
}
6823
6824int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6825 const UserPerm& perms, InodeRef *inp)
6826{
6827 int ret = _do_setattr(in, stx, mask, perms, inp);
6828 if (ret < 0)
6829 return ret;
6830 if (mask & CEPH_SETATTR_MODE)
6831 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6832 return ret;
6833}
6834
6835int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6836 const UserPerm& perms)
6837{
6838 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6839 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6840 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6841 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6842 if (cct->_conf->client_permissions) {
6843 int r = may_setattr(in.get(), stx, mask, perms);
6844 if (r < 0)
6845 return r;
6846 }
6847 return __setattrx(in.get(), stx, mask, perms);
6848}
6849
6850int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6851 const UserPerm& perms)
6852{
6853 struct ceph_statx stx;
6854
6855 stat_to_statx(attr, &stx);
6856 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
6857
6858 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6859 mask &= ~CEPH_SETATTR_UID;
6860 }
6861 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6862 mask &= ~CEPH_SETATTR_GID;
6863 }
6864
7c673cae
FG
6865 return _setattrx(in, &stx, mask, perms);
6866}
6867
6868int Client::setattr(const char *relpath, struct stat *attr, int mask,
6869 const UserPerm& perms)
6870{
6871 Mutex::Locker lock(client_lock);
6872 tout(cct) << "setattr" << std::endl;
6873 tout(cct) << relpath << std::endl;
6874 tout(cct) << mask << std::endl;
6875
181888fb
FG
6876 if (unmounting)
6877 return -ENOTCONN;
6878
7c673cae
FG
6879 filepath path(relpath);
6880 InodeRef in;
6881 int r = path_walk(path, &in, perms);
6882 if (r < 0)
6883 return r;
6884 return _setattr(in, attr, mask, perms);
6885}
6886
6887int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6888 const UserPerm& perms, int flags)
6889{
6890 Mutex::Locker lock(client_lock);
6891 tout(cct) << "setattrx" << std::endl;
6892 tout(cct) << relpath << std::endl;
6893 tout(cct) << mask << std::endl;
6894
181888fb
FG
6895 if (unmounting)
6896 return -ENOTCONN;
6897
7c673cae
FG
6898 filepath path(relpath);
6899 InodeRef in;
6900 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6901 if (r < 0)
6902 return r;
6903 return _setattrx(in, stx, mask, perms);
6904}
6905
6906int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6907{
6908 Mutex::Locker lock(client_lock);
6909 tout(cct) << "fsetattr" << std::endl;
6910 tout(cct) << fd << std::endl;
6911 tout(cct) << mask << std::endl;
6912
181888fb
FG
6913 if (unmounting)
6914 return -ENOTCONN;
6915
7c673cae
FG
6916 Fh *f = get_filehandle(fd);
6917 if (!f)
6918 return -EBADF;
6919#if defined(__linux__) && defined(O_PATH)
6920 if (f->flags & O_PATH)
6921 return -EBADF;
6922#endif
6923 return _setattr(f->inode, attr, mask, perms);
6924}
6925
6926int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6927{
6928 Mutex::Locker lock(client_lock);
6929 tout(cct) << "fsetattr" << std::endl;
6930 tout(cct) << fd << std::endl;
6931 tout(cct) << mask << std::endl;
6932
181888fb
FG
6933 if (unmounting)
6934 return -ENOTCONN;
6935
7c673cae
FG
6936 Fh *f = get_filehandle(fd);
6937 if (!f)
6938 return -EBADF;
6939#if defined(__linux__) && defined(O_PATH)
6940 if (f->flags & O_PATH)
6941 return -EBADF;
6942#endif
6943 return _setattrx(f->inode, stx, mask, perms);
6944}
6945
6946int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
6947 frag_info_t *dirstat, int mask)
6948{
6949 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6950 Mutex::Locker lock(client_lock);
6951 tout(cct) << "stat" << std::endl;
6952 tout(cct) << relpath << std::endl;
181888fb
FG
6953
6954 if (unmounting)
6955 return -ENOTCONN;
6956
7c673cae
FG
6957 filepath path(relpath);
6958 InodeRef in;
6959 int r = path_walk(path, &in, perms, true, mask);
6960 if (r < 0)
6961 return r;
6962 r = _getattr(in, mask, perms);
6963 if (r < 0) {
6964 ldout(cct, 3) << "stat exit on error!" << dendl;
6965 return r;
6966 }
6967 fill_stat(in, stbuf, dirstat);
6968 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
6969 return r;
6970}
6971
6972unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
6973{
6974 unsigned mask = 0;
6975
6976 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
6977 if (flags & AT_NO_ATTR_SYNC)
6978 goto out;
6979
6980 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
6981 mask |= CEPH_CAP_PIN;
6982 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6983 mask |= CEPH_CAP_AUTH_SHARED;
6984 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6985 mask |= CEPH_CAP_LINK_SHARED;
6986 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
6987 mask |= CEPH_CAP_FILE_SHARED;
6988 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
6989 mask |= CEPH_CAP_XATTR_SHARED;
6990out:
6991 return mask;
6992}
6993
6994int Client::statx(const char *relpath, struct ceph_statx *stx,
6995 const UserPerm& perms,
6996 unsigned int want, unsigned int flags)
6997{
6998 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
6999 Mutex::Locker lock(client_lock);
7000 tout(cct) << "statx" << std::endl;
7001 tout(cct) << relpath << std::endl;
181888fb
FG
7002
7003 if (unmounting)
7004 return -ENOTCONN;
7005
7c673cae
FG
7006 filepath path(relpath);
7007 InodeRef in;
7008
7009 unsigned mask = statx_to_mask(flags, want);
7010
7011 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7012 if (r < 0)
7013 return r;
7014
7015 r = _getattr(in, mask, perms);
7016 if (r < 0) {
7017 ldout(cct, 3) << "statx exit on error!" << dendl;
7018 return r;
7019 }
7020
7021 fill_statx(in, mask, stx);
7022 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7023 return r;
7024}
7025
7026int Client::lstat(const char *relpath, struct stat *stbuf,
7027 const UserPerm& perms, frag_info_t *dirstat, int mask)
7028{
7029 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7030 Mutex::Locker lock(client_lock);
7031 tout(cct) << "lstat" << std::endl;
7032 tout(cct) << relpath << std::endl;
181888fb
FG
7033
7034 if (unmounting)
7035 return -ENOTCONN;
7036
7c673cae
FG
7037 filepath path(relpath);
7038 InodeRef in;
7039 // don't follow symlinks
7040 int r = path_walk(path, &in, perms, false, mask);
7041 if (r < 0)
7042 return r;
7043 r = _getattr(in, mask, perms);
7044 if (r < 0) {
7045 ldout(cct, 3) << "lstat exit on error!" << dendl;
7046 return r;
7047 }
7048 fill_stat(in, stbuf, dirstat);
7049 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7050 return r;
7051}
7052
// Populate a struct stat from cached inode state; optionally copies out
// the directory fragment stats and recursive stats.  Returns the caps
// currently issued on the inode.
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // The snap id doubles as the device number so snapshots are
  // distinguishable from the live filesystem.
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // Directory link counts are synthesized: CephFS tracks nlink as 0
    // (unlinked) or 1; anything else would be a tracking bug.
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // ctime is reported as the later of ctime and mtime.
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // Directory size: recursive byte count or entry count, per config.
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;
  }
  st->st_blksize = MAX(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7114
// Populate a ceph_statx from cached inode state.  `mask` is the cap mask
// describing which attribute groups are fresh; only those groups are
// filled in and reflected in stx_mask.
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  // Ownership, full mode, and birth time require auth caps.
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  // Link count requires link caps; directory nlink is synthesized
  // (CephFS tracks it only as 0 = unlinked or 1 = linked).
  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  // Timestamps and size require file caps.
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // Directory size: recursive byte count or entry count, per config.
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7197
// Mark the dentry as recently used so the LRU keeps it cached longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7202
7203int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7204{
7205 Mutex::Locker lock(client_lock);
7206 tout(cct) << "chmod" << std::endl;
7207 tout(cct) << relpath << std::endl;
7208 tout(cct) << mode << std::endl;
181888fb
FG
7209
7210 if (unmounting)
7211 return -ENOTCONN;
7212
7c673cae
FG
7213 filepath path(relpath);
7214 InodeRef in;
7215 int r = path_walk(path, &in, perms);
7216 if (r < 0)
7217 return r;
7218 struct stat attr;
7219 attr.st_mode = mode;
7220 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7221}
7222
7223int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7224{
7225 Mutex::Locker lock(client_lock);
7226 tout(cct) << "fchmod" << std::endl;
7227 tout(cct) << fd << std::endl;
7228 tout(cct) << mode << std::endl;
181888fb
FG
7229
7230 if (unmounting)
7231 return -ENOTCONN;
7232
7c673cae
FG
7233 Fh *f = get_filehandle(fd);
7234 if (!f)
7235 return -EBADF;
7236#if defined(__linux__) && defined(O_PATH)
7237 if (f->flags & O_PATH)
7238 return -EBADF;
7239#endif
7240 struct stat attr;
7241 attr.st_mode = mode;
7242 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7243}
7244
7245int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7246{
7247 Mutex::Locker lock(client_lock);
7248 tout(cct) << "lchmod" << std::endl;
7249 tout(cct) << relpath << std::endl;
7250 tout(cct) << mode << std::endl;
181888fb
FG
7251
7252 if (unmounting)
7253 return -ENOTCONN;
7254
7c673cae
FG
7255 filepath path(relpath);
7256 InodeRef in;
7257 // don't follow symlinks
7258 int r = path_walk(path, &in, perms, false);
7259 if (r < 0)
7260 return r;
7261 struct stat attr;
7262 attr.st_mode = mode;
7263 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7264}
7265
7266int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7267 const UserPerm& perms)
7268{
7269 Mutex::Locker lock(client_lock);
7270 tout(cct) << "chown" << std::endl;
7271 tout(cct) << relpath << std::endl;
7272 tout(cct) << new_uid << std::endl;
7273 tout(cct) << new_gid << std::endl;
181888fb
FG
7274
7275 if (unmounting)
7276 return -ENOTCONN;
7277
7c673cae
FG
7278 filepath path(relpath);
7279 InodeRef in;
7280 int r = path_walk(path, &in, perms);
7281 if (r < 0)
7282 return r;
7283 struct stat attr;
7284 attr.st_uid = new_uid;
7285 attr.st_gid = new_gid;
181888fb 7286 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7287}
7288
7289int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7290{
7291 Mutex::Locker lock(client_lock);
7292 tout(cct) << "fchown" << std::endl;
7293 tout(cct) << fd << std::endl;
7294 tout(cct) << new_uid << std::endl;
7295 tout(cct) << new_gid << std::endl;
181888fb
FG
7296
7297 if (unmounting)
7298 return -ENOTCONN;
7299
7c673cae
FG
7300 Fh *f = get_filehandle(fd);
7301 if (!f)
7302 return -EBADF;
7303#if defined(__linux__) && defined(O_PATH)
7304 if (f->flags & O_PATH)
7305 return -EBADF;
7306#endif
7307 struct stat attr;
7308 attr.st_uid = new_uid;
7309 attr.st_gid = new_gid;
7310 int mask = 0;
7311 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7312 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7313 return _setattr(f->inode, &attr, mask, perms);
7314}
7315
7316int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7317 const UserPerm& perms)
7318{
7319 Mutex::Locker lock(client_lock);
7320 tout(cct) << "lchown" << std::endl;
7321 tout(cct) << relpath << std::endl;
7322 tout(cct) << new_uid << std::endl;
7323 tout(cct) << new_gid << std::endl;
181888fb
FG
7324
7325 if (unmounting)
7326 return -ENOTCONN;
7327
7c673cae
FG
7328 filepath path(relpath);
7329 InodeRef in;
7330 // don't follow symlinks
7331 int r = path_walk(path, &in, perms, false);
7332 if (r < 0)
7333 return r;
7334 struct stat attr;
7335 attr.st_uid = new_uid;
7336 attr.st_gid = new_gid;
7337 int mask = 0;
7338 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7339 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7340 return _setattr(in, &attr, mask, perms);
7341}
7342
7343int Client::utime(const char *relpath, struct utimbuf *buf,
7344 const UserPerm& perms)
7345{
7346 Mutex::Locker lock(client_lock);
7347 tout(cct) << "utime" << std::endl;
7348 tout(cct) << relpath << std::endl;
7349 tout(cct) << buf->modtime << std::endl;
7350 tout(cct) << buf->actime << std::endl;
181888fb
FG
7351
7352 if (unmounting)
7353 return -ENOTCONN;
7354
7c673cae
FG
7355 filepath path(relpath);
7356 InodeRef in;
7357 int r = path_walk(path, &in, perms);
7358 if (r < 0)
7359 return r;
7360 struct stat attr;
7361 stat_set_mtime_sec(&attr, buf->modtime);
7362 stat_set_mtime_nsec(&attr, 0);
7363 stat_set_atime_sec(&attr, buf->actime);
7364 stat_set_atime_nsec(&attr, 0);
7365 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7366}
7367
7368int Client::lutime(const char *relpath, struct utimbuf *buf,
7369 const UserPerm& perms)
7370{
7371 Mutex::Locker lock(client_lock);
7372 tout(cct) << "lutime" << std::endl;
7373 tout(cct) << relpath << std::endl;
7374 tout(cct) << buf->modtime << std::endl;
7375 tout(cct) << buf->actime << std::endl;
181888fb
FG
7376
7377 if (unmounting)
7378 return -ENOTCONN;
7379
7c673cae
FG
7380 filepath path(relpath);
7381 InodeRef in;
7382 // don't follow symlinks
7383 int r = path_walk(path, &in, perms, false);
7384 if (r < 0)
7385 return r;
7386 struct stat attr;
7387 stat_set_mtime_sec(&attr, buf->modtime);
7388 stat_set_mtime_nsec(&attr, 0);
7389 stat_set_atime_sec(&attr, buf->actime);
7390 stat_set_atime_nsec(&attr, 0);
7391 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7392}
7393
7394int Client::flock(int fd, int operation, uint64_t owner)
7395{
7396 Mutex::Locker lock(client_lock);
7397 tout(cct) << "flock" << std::endl;
7398 tout(cct) << fd << std::endl;
7399 tout(cct) << operation << std::endl;
7400 tout(cct) << owner << std::endl;
181888fb
FG
7401
7402 if (unmounting)
7403 return -ENOTCONN;
7404
7c673cae
FG
7405 Fh *f = get_filehandle(fd);
7406 if (!f)
7407 return -EBADF;
7408
7409 return _flock(f, operation, owner);
7410}
7411
7412int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7413{
7414 Mutex::Locker lock(client_lock);
7415 tout(cct) << "opendir" << std::endl;
7416 tout(cct) << relpath << std::endl;
181888fb
FG
7417
7418 if (unmounting)
7419 return -ENOTCONN;
7420
7c673cae
FG
7421 filepath path(relpath);
7422 InodeRef in;
7423 int r = path_walk(path, &in, perms, true);
7424 if (r < 0)
7425 return r;
7426 if (cct->_conf->client_permissions) {
7427 int r = may_open(in.get(), O_RDONLY, perms);
7428 if (r < 0)
7429 return r;
7430 }
7431 r = _opendir(in.get(), dirpp, perms);
7432 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7433 if (r != -ENOTDIR)
7434 tout(cct) << (unsigned long)*dirpp << std::endl;
7435 return r;
7436}
7437
7438int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7439{
7440 if (!in->is_dir())
7441 return -ENOTDIR;
7442 *dirpp = new dir_result_t(in, perms);
7443 opened_dirs.insert(*dirpp);
7444 ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7445 return 0;
7446}
7447
7448
int Client::closedir(dir_result_t *dir)
{
  // Close a directory stream previously returned by opendir().
  Mutex::Locker lock(client_lock);
  tout(cct) << "closedir" << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
  _closedir(dir);
  // _closedir() frees the handle unconditionally; always report success.
  return 0;
}
7459
void Client::_closedir(dir_result_t *dirp)
{
  // Tear down a dir_result_t: drop its inode reference, discard any
  // buffered dentries, deregister it from opened_dirs, and delete it.
  // Called with client_lock held (see closedir()).
  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();  // release our InodeRef so the inode can be trimmed
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7471
void Client::rewinddir(dir_result_t *dirp)
{
  // Reset a directory stream to the beginning, dropping any buffered
  // dentries so the next readdir refetches from the MDS.
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7484
7485loff_t Client::telldir(dir_result_t *dirp)
7486{
7487 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7488 ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
7489 return d->offset;
7490}
7491
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  // Reposition a directory stream to an offset previously returned by
  // telldir().  Invalidate cached state that the new position makes stale.
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // Hash-ordered listing: buffered entries are only valid for forward
    // motion; a backward seek forces a refetch.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Frag-ordered listing: drop the buffer when seeking to the start,
    // into a different dirfrag, or backward within the buffered frag.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7525
7526
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // Populate a struct dirent for the readdir callbacks.  Names longer
  // than 255 bytes are silently truncated to fit d_name.
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  // d_off is the offset of the *next* entry (Linux convention).
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7549
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  // Advance the directory stream past the currently-buffered dirfrag.
  // If that was the last (rightmost) frag, mark the stream at end.
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // Frag order: restart within the new frag and re-resolve it against
    // the (possibly updated) dirfragtree.
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7575
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  // Re-resolve the stream's current frag against the inode's dirfragtree;
  // if the tree has changed (frag split/merged), restart at the frag the
  // tree now maps to.  No-op for hash-ordered listings.
  assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;  // 0 and 1 are reserved for "." and ".."
  }
}
7592
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  // Discard any buffered dentries; the next readdir will refetch them.
  ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
  dirp->buffer.clear();
}
7598
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  // Fetch one dirfrag worth of entries from the MDS into dirp->buffer.
  // Returns 0 on success; on error sets the end flag and returns the
  // (negative) error.  Retries transparently on -EAGAIN after re-choosing
  // the frag (our dirfragtree may have been outdated).
  assert(dirp);
  assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  // Snapshot directories list snapshots instead of plain dentries.
  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // Resume listing after the last entry we returned.
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7653
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// Dentry* by readdir offset (via fpos_cmp on the frag/offset encoding),
// letting us binary-search for the first entry at or after a position.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7659
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  // Serve a readdir from the locally cached dentries (Dir::readdir_cache)
  // instead of querying the MDS.  Invokes cb for each entry with
  // client_lock dropped.  Returns 0 at end of directory, a positive value
  // if cb asked to stop, -EAGAIN if the cache became incomplete/unordered
  // mid-scan (caller falls back to fetching frags), or another negative
  // errno on failure.
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // Binary-search for the first cached dentry at or after our offset.
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // Re-check each iteration: the cb below runs unlocked, so the cache
    // can be invalidated while we iterate.
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // Dentry lease is from an older cap generation; don't trust it.
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7743
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  // Core readdir loop: stream directory entries to cb, starting at the
  // stream's current offset.  Synthesizes "." (offset 0) and ".." (offset
  // 1), then serves entries from the local dentry cache when it is
  // complete, otherwise fetches dirfrags from the MDS.  The callback runs
  // with client_lock dropped; a negative return from cb aborts (and is
  // returned), a positive return stops after the current entry.
  // Returns 0 at end of directory.
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  if (dirp->offset == 0) {
    // Synthesize "." from the directory inode itself.
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  if (dirp->offset == 1) {
    // Synthesize ".." from our parent (or ourselves if unlinked/root).
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache invalidated mid-scan; fall through to frag fetch.
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // Deliver buffered entries starting at our current offset.
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // We walked the whole directory; if nothing changed underneath us,
    // mark the in-memory directory complete (and ordered) so future
    // readdirs can be served from the cache.
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
7936
7937
7938int Client::readdir_r(dir_result_t *d, struct dirent *de)
7939{
7940 return readdirplus_r(d, de, 0, 0, 0, NULL);
7941}
7942
7943/*
7944 * readdirplus_r
7945 *
7946 * returns
7947 * 1 if we got a dirent
7948 * 0 for end of directory
7949 * <0 on error
7950 */
7951
7952struct single_readdir {
7953 struct dirent *de;
7954 struct ceph_statx *stx;
7955 Inode *inode;
7956 bool full;
7957};
7958
7959static int _readdir_single_dirent_cb(void *p, struct dirent *de,
7960 struct ceph_statx *stx, off_t off,
7961 Inode *in)
7962{
7963 single_readdir *c = static_cast<single_readdir *>(p);
7964
7965 if (c->full)
7966 return -1; // already filled this dirent
7967
7968 *c->de = *de;
7969 if (c->stx)
7970 *c->stx = *stx;
7971 c->inode = in;
7972 c->full = true;
7973 return 1;
7974}
7975
7976struct dirent *Client::readdir(dir_result_t *d)
7977{
7978 int ret;
7979 static struct dirent de;
7980 single_readdir sr;
7981 sr.de = &de;
7982 sr.stx = NULL;
7983 sr.inode = NULL;
7984 sr.full = false;
7985
7986 // our callback fills the dirent and sets sr.full=true on first
7987 // call, and returns -1 the second time around.
7988 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
7989 if (ret < -1) {
7990 errno = -ret; // this sucks.
7991 return (dirent *) NULL;
7992 }
7993 if (sr.full) {
7994 return &de;
7995 }
7996 return (dirent *) NULL;
7997}
7998
7999int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8000 struct ceph_statx *stx, unsigned want,
8001 unsigned flags, Inode **out)
8002{
8003 single_readdir sr;
8004 sr.de = de;
8005 sr.stx = stx;
8006 sr.inode = NULL;
8007 sr.full = false;
8008
8009 // our callback fills the dirent and sets sr.full=true on first
8010 // call, and returns -1 the second time around.
8011 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8012 if (r < -1)
8013 return r;
8014 if (out)
8015 *out = sr.inode;
8016 if (sr.full)
8017 return 1;
8018 return 0;
8019}
8020
8021
8022/* getdents */
8023struct getdents_result {
8024 char *buf;
8025 int buflen;
8026 int pos;
8027 bool fullent;
8028};
8029
8030static int _readdir_getdent_cb(void *p, struct dirent *de,
8031 struct ceph_statx *stx, off_t off, Inode *in)
8032{
8033 struct getdents_result *c = static_cast<getdents_result *>(p);
8034
8035 int dlen;
8036 if (c->fullent)
8037 dlen = sizeof(*de);
8038 else
8039 dlen = strlen(de->d_name) + 1;
8040
8041 if (c->pos + dlen > c->buflen)
8042 return -1; // doesn't fit
8043
8044 if (c->fullent) {
8045 memcpy(c->buf + c->pos, de, sizeof(*de));
8046 } else {
8047 memcpy(c->buf + c->pos, de->d_name, dlen);
8048 }
8049 c->pos += dlen;
8050 return 0;
8051}
8052
8053int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8054{
8055 getdents_result gr;
8056 gr.buf = buf;
8057 gr.buflen = buflen;
8058 gr.fullent = fullent;
8059 gr.pos = 0;
8060
8061 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8062
8063 if (r < 0) { // some error
8064 if (r == -1) { // buffer ran out of space
8065 if (gr.pos) { // but we got some entries already!
8066 return gr.pos;
8067 } // or we need a larger buffer
8068 return -ERANGE;
8069 } else { // actual error, return it
8070 return r;
8071 }
8072 }
8073 return gr.pos;
8074}
8075
8076
8077/* getdir */
8078struct getdir_result {
8079 list<string> *contents;
8080 int num;
8081};
8082
8083static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8084{
8085 getdir_result *r = static_cast<getdir_result *>(p);
8086
8087 r->contents->push_back(de->d_name);
8088 r->num++;
8089 return 0;
8090}
8091
8092int Client::getdir(const char *relpath, list<string>& contents,
8093 const UserPerm& perms)
8094{
8095 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8096 {
8097 Mutex::Locker lock(client_lock);
8098 tout(cct) << "getdir" << std::endl;
8099 tout(cct) << relpath << std::endl;
8100 }
8101
8102 dir_result_t *d;
8103 int r = opendir(relpath, &d, perms);
8104 if (r < 0)
8105 return r;
8106
8107 getdir_result gr;
8108 gr.contents = &contents;
8109 gr.num = 0;
8110 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8111
8112 closedir(d);
8113
8114 if (r < 0)
8115 return r;
8116 return gr.num;
8117}
8118
8119
8120/****** file i/o **********/
/****** file i/o **********/
// Open (and optionally create) the file named by relpath.  On success
// returns a non-negative client fd registered in fd_map; on failure a
// negative errno.  Striping parameters and data_pool only apply when a
// new file is created (0/NULL means use defaults).
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // Exclusive create on an existing file must fail.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // Target is missing and creation was requested: walk to the parent
    // directory and create the last path component there.
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create() may have already produced an Fh; otherwise open the inode.
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8205
8206int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8207{
8208 /* Use default file striping parameters */
8209 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8210}
8211
int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
			const UserPerm& perms)
{
  // Ask an MDS to locate inode `ino` via the hash of `name` within
  // directory `dirino` (CEPH_MDS_OP_LOOKUPHASH).  Returns the request
  // result; the inode lands in our cache on success.
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
  filepath path(ino);
  req->set_filepath(path);

  // Encode the rjenkins hash of the dentry name as the second path's
  // single component, as the LOOKUPHASH op expects.
  uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
  char f[30];
  sprintf(f, "%u", h);
  filepath path2(dirino);
  path2.push_dentry(string(f));
  req->set_filepath2(path2);

  // Any in-MDS will do; pick one at random.
  int r = make_request(req, perms, NULL, NULL,
		       rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
  return r;
}
8237
8238
8239/**
8240 * Load inode into local cache.
8241 *
8242 * If inode pointer is non-NULL, and take a reference on
8243 * the resulting Inode object in one operation, so that caller
8244 * can safely assume inode will still be there after return.
8245 */
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  // Load inode `ino` into the local cache via CEPH_MDS_OP_LOOKUPINO.
  // If `inode` is non-NULL, also hand back a referenced Inode* (caller
  // must later drop the ref).
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // Any in-MDS will do; pick one at random.
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // On success the inode must now be in our cache (head version).
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
  return r;
}
8269
8270
8271
8272/**
8273 * Find the parent inode of `ino` and insert it into
8274 * our cache. Conditionally also set `parent` to a referenced
8275 * Inode* if caller provides non-NULL value.
8276 */
8277int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8278{
8279 Mutex::Locker lock(client_lock);
8280 ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
8281
181888fb
FG
8282 if (unmounting)
8283 return -ENOTCONN;
8284
7c673cae
FG
8285 if (!ino->dn_set.empty()) {
8286 // if we exposed the parent here, we'd need to check permissions,
8287 // but right now we just rely on the MDS doing so in make_request
8288 ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
8289 return 0;
8290 }
8291
8292 if (ino->is_root()) {
8293 *parent = NULL;
8294 ldout(cct, 3) << "ino is root, no parent" << dendl;
8295 return -EINVAL;
8296 }
8297
8298 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8299 filepath path(ino->ino);
8300 req->set_filepath(path);
8301
8302 InodeRef target;
8303 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8304 // Give caller a reference to the parent ino if they provided a pointer.
8305 if (parent != NULL) {
8306 if (r == 0) {
8307 *parent = target.get();
8308 _ll_get(*parent);
8309 ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
8310 } else {
8311 *parent = NULL;
8312 }
8313 }
8314 ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
8315 return r;
8316}
8317
8318
8319/**
8320 * Populate the parent dentry for `ino`, provided it is
8321 * a child of `parent`.
8322 */
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  // Populate the dentry linking `ino` under `parent` in our cache via
  // CEPH_MDS_OP_LOOKUPNAME.  `parent` must be a directory.
  assert(parent->is_dir());

  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // Any in-MDS will do; pick one at random.
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8342
8343
// Build a new Fh (open-file handle) for inode `in` with the given open
// flags and ceph file mode, configuring readahead limits from client
// config and the file layout.  Caller owns the returned handle
// (released via _release_fh()).
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  assert(in);
  Fh *f = new Fh(in);
  f->mode = cmode;
  f->flags = flags;

  // inode
  f->actor_perms = perms;

  ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // Snapshot inodes track open handles via snap_cap_refs instead of
    // regular open refs.
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
		  << ccap_string(in->caps_issued()) << dendl;
  }

  // Readahead limits: smaller of the byte cap and the period cap,
  // aligned to the file layout's period and stripe unit.
  const md_config_t *conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
8380
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  // Release an open-file handle: drop any delegation, flush dirty data
  // and release caps/refs, release file locks, then surface any async
  // write error recorded on the handle.
  Inode *in = f->inode.get();
  ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // Last open ref for this mode: flush buffered data and let the
      // MDS know our cap wants may have shrunk.
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // Snapshot inodes track opens via snap_cap_refs (see _create_fh).
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8415
8416void Client::_put_fh(Fh *f)
8417{
8418 int left = f->put();
8419 if (!left) {
8420 delete f;
8421 }
8422}
8423
// Open an already-looked-up inode.  Fast path: when we already hold the
// caps this open mode wants (and no O_TRUNC), no MDS round trip is made.
// Otherwise issue CEPH_MDS_OP_OPEN.  On success a new Fh is created via
// _create_fh(); on failure the pending open ref is dropped again.
// Returns 0 or a negative errno.
8424 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
8425 		  const UserPerm& perms)
8426 {
  // Snapshots are immutable: any write-ish flag is refused outright.
8427   if (in->snapid != CEPH_NOSNAP &&
8428       (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
8429     return -EROFS;
8430   }
8431 
8432   // use normalized flags to generate cmode
8433   int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
8434   if (cmode < 0)
8435     return -EINVAL;
8436   int want = ceph_caps_for_mode(cmode);
8437   int result = 0;
8438 
8439   in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.
8440 
b32b8144 8441   if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
7c673cae 
FG 
8442     // update wanted?
8443     check_caps(in, CHECK_CAPS_NODELAY);
8444   } else {
b32b8144 8445 
7c673cae 
FG 
8446     MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8447     filepath path;
8448     in->make_nosnap_relative_path(path);
8449     req->set_filepath(path);
    // O_CREAT is stripped: the inode already exists at this point.
8450     req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
8451     req->head.args.open.mode = mode;
8452     req->head.args.open.pool = -1;
8453     if (cct->_conf->client_debug_getattr_caps)
8454       req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8455     else
8456       req->head.args.open.mask = 0;
8457     req->head.args.open.old_size = in->size;   // for O_TRUNC
8458     req->set_inode(in);
8459     result = make_request(req, perms);
b32b8144 
FG 
8460 
8461     /*
8462      * NFS expects that delegations will be broken on a conflicting open,
8463      * not just when there is actual conflicting access to the file. SMB leases
8464      * and oplocks also have similar semantics.
8465      *
8466      * Ensure that clients that have delegations enabled will wait on minimal
8467      * caps during open, just to ensure that other clients holding delegations
8468      * return theirs first.
8469      */
8470     if (deleg_timeout && result == 0) {
8471       int need = 0, have;
8472 
8473       if (cmode & CEPH_FILE_MODE_WR)
8474         need |= CEPH_CAP_FILE_WR;
8475       if (cmode & CEPH_FILE_MODE_RD)
8476         need |= CEPH_CAP_FILE_RD;
8477 
      // Block until the needed caps are granted (delegations recalled).
8478       result = get_caps(in, need, want, &have, -1);
8479       if (result < 0) {
8480 	ldout(cct, 1) << "Unable to get caps after open of inode " << *in <<
8481 	  " . Denying open: " <<
8482 	  cpp_strerror(result) << dendl;
8483 	in->put_open_ref(cmode);
8484       } else {
8485 	put_cap_ref(in, need);
8486       }
8487     }
7c673cae 
FG 
8488   }
8489 
8490   // success?
8491   if (result >= 0) {
8492     if (fhp)
8493       *fhp = _create_fh(in, flags, cmode, perms);
8494   } else {
    // Failed open: undo the pending-open accounting from above.
8495     in->put_open_ref(cmode);
8496   }
8497 
8498   trim_cache();
8499 
8500   return result;
8501 }
8502
// Re-acquire file caps for an inode, typically after they were dropped
// or a session hiccup.  If we still hold caps and either want no write
// caps or still know the auth MDS, a plain cap update suffices;
// otherwise replay an OPEN with flags derived from the wanted caps.
8503 int Client::_renew_caps(Inode *in)
8504 {
8505   int wanted = in->caps_file_wanted();
8506   if (in->is_any_caps() &&
8507       ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
8508     check_caps(in, CHECK_CAPS_NODELAY);
8509     return 0;
8510   }
8511 
  // Map the wanted RD/WR caps back onto open(2)-style flags.
8512   int flags = 0;
8513   if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
8514     flags = O_RDWR;
8515   else if (wanted & CEPH_CAP_FILE_RD)
8516     flags = O_RDONLY;
8517   else if (wanted & CEPH_CAP_FILE_WR)
8518     flags = O_WRONLY;
8519 
8520   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8521   filepath path;
8522   in->make_nosnap_relative_path(path);
8523   req->set_filepath(path);
8524   req->head.args.open.flags = flags;
8525   req->head.args.open.pool = -1;
8526   if (cct->_conf->client_debug_getattr_caps)
8527     req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8528   else
8529     req->head.args.open.mask = 0;
8530   req->set_inode(in);
8531 
8532   // duplicate in case Cap goes away; not sure if that race is a concern?
8533   const UserPerm *pperm = in->get_best_perms();
8534   UserPerm perms;
8535   if (pperm != NULL)
8536     perms = *pperm;
8537   int ret = make_request(req, perms);
8538   return ret;
8539 }
8540
8541int Client::close(int fd)
8542{
8543 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8544 Mutex::Locker lock(client_lock);
8545 tout(cct) << "close" << std::endl;
8546 tout(cct) << fd << std::endl;
8547
181888fb
FG
8548 if (unmounting)
8549 return -ENOTCONN;
8550
7c673cae
FG
8551 Fh *fh = get_filehandle(fd);
8552 if (!fh)
8553 return -EBADF;
8554 int err = _release_fh(fh);
8555 fd_map.erase(fd);
8556 put_fd(fd);
8557 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8558 return err;
8559}
8560
8561
8562// ------------
8563// read, write
8564
8565loff_t Client::lseek(int fd, loff_t offset, int whence)
8566{
8567 Mutex::Locker lock(client_lock);
8568 tout(cct) << "lseek" << std::endl;
8569 tout(cct) << fd << std::endl;
8570 tout(cct) << offset << std::endl;
8571 tout(cct) << whence << std::endl;
8572
181888fb
FG
8573 if (unmounting)
8574 return -ENOTCONN;
8575
7c673cae
FG
8576 Fh *f = get_filehandle(fd);
8577 if (!f)
8578 return -EBADF;
8579#if defined(__linux__) && defined(O_PATH)
8580 if (f->flags & O_PATH)
8581 return -EBADF;
8582#endif
8583 return _lseek(f, offset, whence);
8584}
8585
8586loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8587{
8588 Inode *in = f->inode.get();
8589 int r;
8590
8591 switch (whence) {
8592 case SEEK_SET:
8593 f->pos = offset;
8594 break;
8595
8596 case SEEK_CUR:
8597 f->pos += offset;
8598 break;
8599
8600 case SEEK_END:
8601 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8602 if (r < 0)
8603 return r;
8604 f->pos = in->size + offset;
8605 break;
8606
8607 default:
8608 ceph_abort();
8609 }
8610
8611 ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8612 return f->pos;
8613}
8614
8615
// Serialize access to f->pos for implicit-offset reads/writes.  Waiters
// queue FIFO: each thread parks on its own Cond (bound to client_lock)
// and may proceed only when the lock is free AND it is at the head of
// the queue, so wakeups cannot overtake earlier waiters.
8616 void Client::lock_fh_pos(Fh *f)
8617 {
8618   ldout(cct, 10) << "lock_fh_pos " << f << dendl;
8619 
8620   if (f->pos_locked || !f->pos_waiters.empty()) {
8621     Cond cond;
8622     f->pos_waiters.push_back(&cond);
8623     ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // Wait until the lock is released and we are the oldest waiter.
8624     while (f->pos_locked || f->pos_waiters.front() != &cond)
8625       cond.Wait(client_lock);
8626     ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
8627     assert(f->pos_waiters.front() == &cond);
8628     f->pos_waiters.pop_front();
8629   }
8630 
8631   f->pos_locked = true;
8632 }
8633
8634void Client::unlock_fh_pos(Fh *f)
8635{
8636 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8637 f->pos_locked = false;
8638}
8639
// Migrate MDS-inlined file data out to the first RADOS object so normal
// object I/O can proceed.  Two guarded ops: create the object (ok if it
// exists), then -- only if the object's recorded inline_version is older
// than ours (cmpxattr GT) -- write the data and stamp the new version.
// 'onfinish' fires when the second mutation completes.  Always returns 0
// (completion status is delivered through the Context).
8640 int Client::uninline_data(Inode *in, Context *onfinish)
8641 {
  // Nothing inlined: report immediate success.
8642   if (!in->inline_data.length()) {
8643     onfinish->complete(0);
8644     return 0;
8645   }
8646 
  // Object name of the file's first stripe object: <ino-hex>.00000000.
8647   char oid_buf[32];
8648   snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
8649   object_t oid = oid_buf;
8650 
8651   ObjectOperation create_ops;
8652   create_ops.create(false);
8653 
8654   objecter->mutate(oid,
8655 		   OSDMap::file_to_object_locator(in->layout),
8656 		   create_ops,
8657 		   in->snaprealm->get_snap_context(),
8658 		   ceph::real_clock::now(),
8659 		   0,
8660 		   NULL);
8661 
8662   bufferlist inline_version_bl;
8663   ::encode(in->inline_version, inline_version_bl);
8664 
8665   ObjectOperation uninline_ops;
  // Guard against racing uninliners: only proceed if our version is newer.
8666   uninline_ops.cmpxattr("inline_version",
8667 			CEPH_OSD_CMPXATTR_OP_GT,
8668 			CEPH_OSD_CMPXATTR_MODE_U64,
8669 			inline_version_bl);
8670   bufferlist inline_data = in->inline_data;
8671   uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
8672   uninline_ops.setxattr("inline_version", stringify(in->inline_version));
8673 
8674   objecter->mutate(oid,
8675 		   OSDMap::file_to_object_locator(in->layout),
8676 		   uninline_ops,
8677 		   in->snaprealm->get_snap_context(),
8678 		   ceph::real_clock::now(),
8679 		   0,
8680 		   onfinish);
8681 
8682   return 0;
8683 }
8684
8685//
8686
8687// blocking osd interface
8688
8689int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8690{
8691 Mutex::Locker lock(client_lock);
8692 tout(cct) << "read" << std::endl;
8693 tout(cct) << fd << std::endl;
8694 tout(cct) << size << std::endl;
8695 tout(cct) << offset << std::endl;
8696
181888fb
FG
8697 if (unmounting)
8698 return -ENOTCONN;
8699
7c673cae
FG
8700 Fh *f = get_filehandle(fd);
8701 if (!f)
8702 return -EBADF;
8703#if defined(__linux__) && defined(O_PATH)
8704 if (f->flags & O_PATH)
8705 return -EBADF;
8706#endif
8707 bufferlist bl;
8708 int r = _read(f, offset, size, &bl);
8709 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8710 if (r >= 0) {
8711 bl.copy(0, bl.length(), buf);
8712 r = bl.length();
8713 }
8714 return r;
8715}
8716
8717int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8718{
8719 if (iovcnt < 0)
8720 return -EINVAL;
8721 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8722}
8723
// Core read path.  offset < 0 means "use and advance f->pos" (guarded
// by the fh pos lock).  Handles inlined data, cached (async) reads when
// we hold FILE_CACHE, and sync OSD reads otherwise, retrying once after
// a size re-check on a suspected-EOF short read.  Holds a FILE_RD cap
// ref for the duration.  Returns bytes read or a negative errno.
8724 int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
8725 {
8726   const md_config_t *conf = cct->_conf;
8727   Inode *in = f->inode.get();
8728 
8729   if ((f->mode & CEPH_FILE_MODE_RD) == 0)
8730     return -EBADF;
8731   //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
8732 
8733   bool movepos = false;
8734   if (offset < 0) {
8735     lock_fh_pos(f);
8736     offset = f->pos;
8737     movepos = true;
8738   }
8739   loff_t start_pos = offset;
8740 
  // inline_version == 0 means "unknown": fetch it before deciding paths.
8741   if (in->inline_version == 0) {
8742     int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
c07f9fc5 
FG 
8743     if (r < 0) {
8744       if (movepos)
8745         unlock_fh_pos(f);
7c673cae 8746       return r;
c07f9fc5 8747     }
7c673cae 
FG 
8748     assert(in->inline_version > 0);
8749   }
8750 
8751 retry:
8752   int have;
8753   int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
c07f9fc5 
FG 
8754   if (r < 0) {
8755     if (movepos)
8756       unlock_fh_pos(f);
7c673cae 8757     return r;
c07f9fc5 8758   }
7c673cae 
FG 
  // O_DIRECT bypasses the object cache even when we hold FILE_CACHE.
8759   if (f->flags & O_DIRECT)
8760     have &= ~CEPH_CAP_FILE_CACHE;
8761 
8762   Mutex uninline_flock("Client::_read_uninline_data flock");
8763   Cond uninline_cond;
8764   bool uninline_done = false;
8765   int uninline_ret = 0;
8766   Context *onuninline = NULL;
8767 
8768   if (in->inline_version < CEPH_INLINE_NONE) {
    // Data still inlined in the MDS.  Without FILE_CACHE we must push it
    // out to RADOS first; with it we can serve the read from memory.
8769     if (!(have & CEPH_CAP_FILE_CACHE)) {
8770       onuninline = new C_SafeCond(&uninline_flock,
8771                                   &uninline_cond,
8772                                   &uninline_done,
8773                                   &uninline_ret);
8774       uninline_data(in, onuninline);
8775     } else {
8776       uint32_t len = in->inline_data.length();
8777 
8778       uint64_t endoff = offset + size;
8779       if (endoff > in->size)
8780         endoff = in->size;
8781 
8782       if (offset < len) {
8783         if (endoff <= len) {
8784           bl->substr_of(in->inline_data, offset, endoff - offset);
8785         } else {
8786           bl->substr_of(in->inline_data, offset, len - offset);
8787           bl->append_zero(endoff - len);
8788         }
8789       } else if ((uint64_t)offset < endoff) {
        // Hole past the inlined bytes but inside the file: zero-fill.
8790         bl->append_zero(endoff - offset);
8791       }
8792 
8793       goto success;
8794     }
8795   }
8796 
8797   if (!conf->client_debug_force_sync_read &&
8798       (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {
8799 
    // we're doing buffered reads; O_RSYNC forces writeback of the range first.
8800     if (f->flags & O_RSYNC) {
8801       _flush_range(in, offset, size);
8802     }
8803     r = _read_async(f, offset, size, bl);
8804     if (r < 0)
8805       goto done;
8806   } else {
8807     if (f->flags & O_DIRECT)
8808       _flush_range(in, offset, size);
8809 
8810     bool checkeof = false;
8811     r = _read_sync(f, offset, size, bl, &checkeof);
8812     if (r < 0)
8813       goto done;
8814     if (checkeof) {
8815       offset += r;
8816       size -= r;
8817 
8818       put_cap_ref(in, CEPH_CAP_FILE_RD);
8819       have = 0;
8820       // reverify size
8821       r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8822       if (r < 0)
8823         goto done;
8824 
8825       // eof?  short read.
8826       if ((uint64_t)offset < in->size)
8827         goto retry;
8828     }
8829   }
8830 
8831 success:
8832   if (movepos) {
8833     // adjust fd pos
8834     f->pos = start_pos + bl->length();
8835     unlock_fh_pos(f);
8836   }
8837 
8838 done:
8839   // done!
8840 
  // If we kicked off an uninline above, wait (dropping client_lock) for
  // it to finish before returning.
8841   if (onuninline) {
8842     client_lock.Unlock();
8843     uninline_flock.Lock();
8844     while (!uninline_done)
8845       uninline_cond.Wait(uninline_flock);
8846     uninline_flock.Unlock();
8847     client_lock.Lock();
8848 
    // -ECANCELED means another client won the uninline race; either way
    // the data now lives in RADOS.
8849     if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
8850       in->inline_data.clear();
8851       in->inline_version = CEPH_INLINE_NONE;
28e407b8 8852       in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 
FG 
8853       check_caps(in, 0);
8854     } else
8855       r = uninline_ret;
8856   }
8857 
8858   if (have)
8859     put_cap_ref(in, CEPH_CAP_FILE_RD);
c07f9fc5 
FG 
8860   if (r < 0) {
8861     if (movepos)
8862       unlock_fh_pos(f);
8863     return r;
8864   } else
8865     return bl->length();
7c673cae 
FG 
8866 }
8867
// Completion for background readahead.  The ctor pins the Fh and bumps
// the pending-readahead counter; the dtor undoes both; finish() drops
// the cap refs taken when the readahead was issued.
8868 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
8869     client(c), f(f) {
8870   f->get();
8871   f->readahead.inc_pending();
8872 }
8873 
8874 Client::C_Readahead::~C_Readahead() {
8875   f->readahead.dec_pending();
8876   client->_put_fh(f);
8877 }
8878 
8879 void Client::C_Readahead::finish(int r) {
8880   lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // Matches the get_cap_ref(RD|CACHE) taken in _read_async when the
  // readahead was started.
8881   client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
8882 }
8883
// Cached read through the ObjectCacher.  Trims the request to EOF,
// blocks (dropping client_lock) if the data is not resident, then fires
// an opportunistic readahead whose completion releases its own cap refs
// via C_Readahead.  Returns bytes read or a negative errno.
8884 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
8885 {
8886   const md_config_t *conf = cct->_conf;
8887   Inode *in = f->inode.get();
8888 
8889   ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;
8890 
8891   // trim read based on file size?
8892   if (off >= in->size)
8893     return 0;
8894   if (len == 0)
8895     return 0;
8896   if (off + len > in->size) {
8897     len = in->size - off;
8898   }
8899 
8900   ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
8901                  << " max_bytes=" << f->readahead.get_max_readahead_size()
8902                  << " max_periods=" << conf->client_readahead_max_periods << dendl;
8903 
8904   // read (and possibly block)
8905   int r, rvalue = 0;
8906   Mutex flock("Client::_read_async flock");
8907   Cond cond;
8908   bool done = false;
8909   Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
8910   r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
8911 			      off, len, bl, 0, onfinish);
8912   if (r == 0) {
    // Cache miss: hold FILE_CACHE and wait for the fill outside client_lock.
8913     get_cap_ref(in, CEPH_CAP_FILE_CACHE);
8914     client_lock.Unlock();
8915     flock.Lock();
8916     while (!done)
8917       cond.Wait(flock);
8918     flock.Unlock();
8919     client_lock.Lock();
8920     put_cap_ref(in, CEPH_CAP_FILE_CACHE);
8921     r = rvalue;
8922   } else {
8923     // it was cached.
8924     delete onfinish;
8925   }
8926 
8927   if(f->readahead.get_min_readahead_size() > 0) {
8928     pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
8929     if (readahead_extent.second > 0) {
8930       ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
8931 		     << " (caller wants " << off << "~" << len << ")" << dendl;
8932       Context *onfinish2 = new C_Readahead(this, f);
      // NULL bl: we only want the cache warmed, not the data returned.
8933       int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
8934 				       readahead_extent.first, readahead_extent.second,
8935 				       NULL, 0, onfinish2);
8936       if (r2 == 0) {
8937 	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
        // Released by C_Readahead::finish when the prefetch completes.
8938 	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
8939       } else {
8940 	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
8941 	delete onfinish2;
8942       }
8943     }
8944   }
8945 
8946   return r;
8947 }
8948
// Synchronous (uncached) read straight from the OSDs via the Filer.
// Loops until 'len' bytes are gathered; on a short object read inside
// the known file size it zero-fills the gap, and on reaching apparent
// EOF it sets *checkeof so the caller can re-verify the size and retry.
8950 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
8951 		       bool *checkeof)
8952 {
8953   Inode *in = f->inode.get();
8954   uint64_t pos = off;
8955   int left = len;
8956   int read = 0;
8957 
8958   ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;
8959 
8960   Mutex flock("Client::_read_sync flock");
8961   Cond cond;
8962   while (left > 0) {
8963     int r = 0;
8964     bool done = false;
8965     Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
8966     bufferlist tbl;
8967 
8968     int wanted = left;
8969     filer->read_trunc(in->ino, &in->layout, in->snapid,
8970 		      pos, left, &tbl, 0,
8971 		      in->truncate_size, in->truncate_seq,
8972 		      onfinish);
    // Block for the OSD reply without holding client_lock.
8973     client_lock.Unlock();
8974     flock.Lock();
8975     while (!done)
8976       cond.Wait(flock);
8977     flock.Unlock();
8978     client_lock.Lock();
8979 
8980     // if we get ENOENT from OSD, assume 0 bytes returned
8981     if (r == -ENOENT)
8982       r = 0;
8983     if (r < 0)
8984       return r;
8985     if (tbl.length()) {
8986       r = tbl.length();
8987 
8988       read += r;
8989       pos += r;
8990       left -= r;
8991       bl->claim_append(tbl);
8992     }
8993     // short read?
8994     if (r >= 0 && r < wanted) {
8995       if (pos < in->size) {
8996 	// zero up to known EOF
8997 	int64_t some = in->size - pos;
8998 	if (some > left)
8999 	  some = left;
9000 	bufferptr z(some);
9001 	z.zero();
9002 	bl->push_back(z);
9003 	read += some;
9004 	pos += some;
9005 	left -= some;
9006 	if (left == 0)
9007 	  return read;
9008       }
9009 
      // Possibly at EOF: let the caller re-check the size and retry.
9010       *checkeof = true;
9011       return read;
9012     }
9013   }
9014   return read;
9015 }
9015
9016
9017/*
9018 * we keep count of uncommitted sync writes on the inode, so that
9019 * fsync can DDRT.
9020 */
// Completion bookkeeping for one synchronous write: drop the
// outstanding-unsafe-write count and the FILE_BUFFER cap ref taken when
// the write was issued, and wake a pending unmount once the last unsafe
// write drains.
9021 void Client::_sync_write_commit(Inode *in)
9022 {
9023   assert(unsafe_sync_write > 0);
9024   unsafe_sync_write--;
9025 
9026   put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9027 
9028   ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
9029   if (unsafe_sync_write == 0 && unmounting) {
9030     ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
9031     mount_cond.Signal();
9032   }
9033 }
9034
9035int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9036{
9037 Mutex::Locker lock(client_lock);
9038 tout(cct) << "write" << std::endl;
9039 tout(cct) << fd << std::endl;
9040 tout(cct) << size << std::endl;
9041 tout(cct) << offset << std::endl;
9042
181888fb
FG
9043 if (unmounting)
9044 return -ENOTCONN;
9045
7c673cae
FG
9046 Fh *fh = get_filehandle(fd);
9047 if (!fh)
9048 return -EBADF;
9049#if defined(__linux__) && defined(O_PATH)
9050 if (fh->flags & O_PATH)
9051 return -EBADF;
9052#endif
9053 int r = _write(fh, offset, size, buf, NULL, 0);
9054 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9055 return r;
9056}
9057
9058int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9059{
9060 if (iovcnt < 0)
9061 return -EINVAL;
9062 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9063}
9064
// Shared vectored-I/O implementation for preadv/pwritev.  Writes hand
// the iovec straight to _write; reads gather into one bufferlist and
// then scatter back into the caller's iovecs, tolerating a short read.
// Returns bytes transferred or a negative errno.
9065 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9066 {
9067     Mutex::Locker lock(client_lock);
9068     tout(cct) << fd << std::endl;
9069     tout(cct) << offset << std::endl;
9070 
181888fb 
FG 
9071     if (unmounting)
9072      return -ENOTCONN;
9073 
7c673cae 
FG 
9074     Fh *fh = get_filehandle(fd);
9075     if (!fh)
9076         return -EBADF;
9077 #if defined(__linux__) && defined(O_PATH)
9078     if (fh->flags & O_PATH)
9079         return -EBADF;
9080 #endif
9081     loff_t totallen = 0;
9082     for (unsigned i = 0; i < iovcnt; i++) {
9083         totallen += iov[i].iov_len;
9084     }
9085     if (write) {
9086         int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
9087         ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
9088         return w;
9089     } else {
9090         bufferlist bl;
9091         int r = _read(fh, offset, totallen, &bl);
9092         ldout(cct, 3) << "preadv(" << fd << ", " <<  offset << ") = " << r << dendl;
9093         if (r <= 0)
9094           return r;
9095 
        // Scatter the (possibly short) result across the iovecs.
9096         int bufoff = 0;
9097         for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
9098                /*
9099                 * This piece of code aims to handle the case that bufferlist does not have enough data 
9100                 * to fill in the iov 
9101                 */
9102                if (resid < iov[j].iov_len) {
9103                     bl.copy(bufoff, resid, (char *)iov[j].iov_base);
9104                     break;
9105                } else {
9106                     bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
9107                }
9108                resid -= iov[j].iov_len;
9109                bufoff += iov[j].iov_len;
9110         }
9111         return r;  
9112     }
9113 }
9114
// Core write path.  Validates size/pool/quota, resolves an implicit
// offset (O_APPEND), copies the payload, acquires FILE_WR(+AUTH_SHARED)
// caps, handles setuid/setgid clearing and inline data, then writes via
// the ObjectCacher (buffered) or Filer (sync).  Returns bytes written
// or a negative errno.  May drop and retake client_lock while waiting.
9115 int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
9116 	           const struct iovec *iov, int iovcnt)
9117 {
9118   if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
9119     return -EFBIG;
9120 
9121   //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9122   Inode *in = f->inode.get();
9123 
9124   if (objecter->osdmap_pool_full(in->layout.pool_id)) {
9125     return -ENOSPC;
9126   }
9127 
9128   assert(in->snapid == CEPH_NOSNAP);
9129 
9130   // was Fh opened as writeable?
9131   if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9132     return -EBADF;
9133 
9134   // check quota
9135   uint64_t endoff = offset + size;
28e407b8 
AA 
9136   std::list<InodeRef> quota_roots;
9137   if (endoff > in->size &&
9138       is_quota_bytes_exceeded(in, endoff - in->size, f->actor_perms, &quota_roots)) {
7c673cae 
FG 
9139     return -EDQUOT;
9140   }
9141 
9142   // use/adjust fd pos?
9143   if (offset < 0) {
9144     lock_fh_pos(f);
9145     /*
9146      * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9147      * change out from under us.
9148      */
9149     if (f->flags & O_APPEND) {
9150       int r = _lseek(f, 0, SEEK_END);
9151       if (r < 0) {
9152         unlock_fh_pos(f);
9153         return r;
9154       }
9155     }
9156     offset = f->pos;
9157     f->pos = offset+size;
9158     unlock_fh_pos(f);
9159   }
9160 
9161   //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9162 
9163   ldout(cct, 10) << "cur file size is " << in->size << dendl;
9164 
9165   // time it.
9166   utime_t start = ceph_clock_now();
9167 
  // inline_version == 0 means "unknown": fetch it before deciding paths.
9168   if (in->inline_version == 0) {
9169     int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9170     if (r < 0)
9171       return r;
9172     assert(in->inline_version > 0);
9173   }
9174 
9175   // copy into fresh buffer (since our write may be resub, async)
9176   bufferlist bl;
9177   if (buf) {
9178     if (size > 0)
9179       bl.append(buf, size);
9180   } else if (iov){
9181     for (int i = 0; i < iovcnt; i++) {
9182       if (iov[i].iov_len > 0) {
9183         bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
9184       }
9185     }
9186   }
9187 
9188   utime_t lat;
9189   uint64_t totalwritten;
9190   int have;
9191   int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
9192 		   CEPH_CAP_FILE_BUFFER, &have, endoff);
9193   if (r < 0)
9194     return r;
9195 
9196   /* clear the setuid/setgid bits, if any */
181888fb 9197   if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
7c673cae 
FG 
9198     struct ceph_statx stx = { 0 };
9199 
9200     put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9201     r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
9202     if (r < 0)
      // NOTE(review): this early return does not put_cap_ref(in,
      // CEPH_CAP_FILE_WR) acquired by get_caps() above -- verify
      // whether this leaks a cap reference on the error path.
9203       return r;
9204   } else {
9205     put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9206   }
9207 
  // O_DIRECT bypasses the buffer cache even when we hold FILE_BUFFER.
9208   if (f->flags & O_DIRECT)
9209     have &= ~CEPH_CAP_FILE_BUFFER;
9210 
9211   ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
9212 
9213   Mutex uninline_flock("Client::_write_uninline_data flock");
9214   Cond uninline_cond;
9215   bool uninline_done = false;
9216   int uninline_ret = 0;
9217   Context *onuninline = NULL;
9218 
9219   if (in->inline_version < CEPH_INLINE_NONE) {
    // Inline data: either push it out to RADOS (too big / no buffer
    // caps) or update the inlined copy in place.
9220     if (endoff > cct->_conf->client_max_inline_size ||
9221         endoff > CEPH_INLINE_MAX_SIZE ||
9222         !(have & CEPH_CAP_FILE_BUFFER)) {
9223       onuninline = new C_SafeCond(&uninline_flock,
9224                                   &uninline_cond,
9225                                   &uninline_done,
9226                                   &uninline_ret);
9227       uninline_data(in, onuninline);
9228     } else {
9229       get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9230 
9231       uint32_t len = in->inline_data.length();
9232 
9233       if (endoff < len)
9234         in->inline_data.copy(endoff, len - endoff, bl);
9235 
9236       if (offset < len)
9237         in->inline_data.splice(offset, len - offset);
9238       else if (offset > len)
9239         in->inline_data.append_zero(offset - len);
9240 
9241       in->inline_data.append(bl);
9242       in->inline_version++;
9243 
9244       put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9245 
9246       goto success;
9247     }
9248   }
9249 
9250   if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
9251     // do buffered write
9252     if (!in->oset.dirty_or_tx)
9253       get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
9254 
9255     get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9256 
9257     // async, caching, non-blocking.
9258     r = objectcacher->file_write(&in->oset, &in->layout,
9259 				 in->snaprealm->get_snap_context(),
9260 				 offset, size, bl, ceph::real_clock::now(),
9261 				 0);
9262     put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9263 
9264     if (r < 0)
9265       goto done;
9266 
9267     // flush cached write if O_SYNC is set on file fh
9268     // O_DSYNC == O_SYNC on linux < 2.6.33
9269     // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9270     if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
9271       _flush_range(in, offset, size);
9272     }
9273   } else {
9274     if (f->flags & O_DIRECT)
9275       _flush_range(in, offset, size);
9276 
9277     // simple, non-atomic sync write
9278     Mutex flock("Client::_write flock");
9279     Cond cond;
9280     bool done = false;
9281     Context *onfinish = new C_SafeCond(&flock, &cond, &done);
9282 
9283     unsafe_sync_write++;
9284     get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback
9285 
9286     filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
9287 		       offset, size, bl, ceph::real_clock::now(), 0,
9288 		       in->truncate_size, in->truncate_seq,
9289 		       onfinish);
    // Wait for the OSD ack without holding client_lock.
9290     client_lock.Unlock();
9291     flock.Lock();
9292 
9293     while (!done)
9294       cond.Wait(flock);
9295     flock.Unlock();
9296     client_lock.Lock();
9297     _sync_write_commit(in);
9298   }
9299 
9300   // if we get here, write was successful, update client metadata
9301 success:
9302   // time
9303   lat = ceph_clock_now();
9304   lat -= start;
9305   logger->tinc(l_c_wrlat, lat);
9306 
9307   totalwritten = size;
9308   r = (int)totalwritten;
9309 
9310   // extend file?
9311   if (totalwritten + offset > in->size) {
9312     in->size = totalwritten + offset;
28e407b8 9313     in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 9314 
    // Proactively report size growth to the MDS near quota or max_size
    // boundaries so writes don't stall later.
28e407b8 9315     if (is_quota_bytes_approaching(in, quota_roots)) {
7c673cae 9316       check_caps(in, CHECK_CAPS_NODELAY);
31f18b77 
FG 
9317     } else if (is_max_size_approaching(in)) {
9318       check_caps(in, 0);
7c673cae 
FG 
9319     }
9320 
9321     ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
9322   } else {
9323     ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
9324   }
9325 
9326   // mtime
9327   in->mtime = ceph_clock_now();
9328   in->change_attr++;
28e407b8 9329   in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 
FG 
9330 
9331 done:
9332 
  // If we kicked off an uninline above, wait (dropping client_lock) for
  // it to finish before returning.
9333   if (onuninline) {
9334     client_lock.Unlock();
9335     uninline_flock.Lock();
9336     while (!uninline_done)
9337       uninline_cond.Wait(uninline_flock);
9338     uninline_flock.Unlock();
9339     client_lock.Lock();
9340 
9341     if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
9342       in->inline_data.clear();
9343       in->inline_version = CEPH_INLINE_NONE;
28e407b8 9344       in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 
FG 
9345       check_caps(in, 0);
9346     } else
9347       r = uninline_ret;
9348   }
9349 
9350   put_cap_ref(in, CEPH_CAP_FILE_WR);
9351   return r;
9352 }
9353
9354int Client::_flush(Fh *f)
9355{
9356 Inode *in = f->inode.get();
9357 int err = f->take_async_err();
9358 if (err != 0) {
9359 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9360 << cpp_strerror(err) << dendl;
9361 } else {
9362 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9363 }
9364
9365 return err;
9366}
9367
9368int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9369{
9370 struct ceph_statx stx;
9371 stx.stx_size = length;
9372 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9373}
9374
9375int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9376{
9377 Mutex::Locker lock(client_lock);
9378 tout(cct) << "ftruncate" << std::endl;
9379 tout(cct) << fd << std::endl;
9380 tout(cct) << length << std::endl;
9381
181888fb
FG
9382 if (unmounting)
9383 return -ENOTCONN;
9384
7c673cae
FG
9385 Fh *f = get_filehandle(fd);
9386 if (!f)
9387 return -EBADF;
9388#if defined(__linux__) && defined(O_PATH)
9389 if (f->flags & O_PATH)
9390 return -EBADF;
9391#endif
9392 struct stat attr;
9393 attr.st_size = length;
9394 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9395}
9396
// fsync(2)/fdatasync(2) analogue.  Flushes data (and metadata unless
// syncdataonly) for the fd's inode, then folds in any async write error
// recorded on the handle so callers see background failures too.
9397 int Client::fsync(int fd, bool syncdataonly)
9398 {
9399   Mutex::Locker lock(client_lock);
9400   tout(cct) << "fsync" << std::endl;
9401   tout(cct) << fd << std::endl;
9402   tout(cct) << syncdataonly << std::endl;
9403 
181888fb 
FG 
9404   if (unmounting)
9405     return -ENOTCONN;
9406 
7c673cae 
FG 
9407   Fh *f = get_filehandle(fd);
9408   if (!f)
9409     return -EBADF;
9410 #if defined(__linux__) && defined(O_PATH)
9411   if (f->flags & O_PATH)
9412     return -EBADF;
9413 #endif
9414   int r = _fsync(f, syncdataonly);
9415   if (r == 0) {
9416     // The IOs in this fsync were okay, but maybe something happened
9417     // in the background that we should be reporting?
9418     r = f->take_async_err();
9419     ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
9420 		  << ") = 0, async_err = " << r << dendl;
9421   } else {
9422     // Assume that an error we encountered during fsync, even reported
9423     // synchronously, would also have applied the error to the Fh, and we
9424     // should clear it here to avoid returning the same error again on next
9425     // call.
9426     ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
9427 		  << r << dendl;
9428     f->take_async_err();
9429   }
9430   return r;
9431 }
9432
// Inode-level fsync: start an ObjectCacher flush (when enabled), flush
// dirty caps and wait for unsafe MDS ops (unless syncdataonly), then
// wait for the data flush to complete.  Drops client_lock while
// blocking on the flush completion.  Returns 0 or a negative errno.
9433 int Client::_fsync(Inode *in, bool syncdataonly)
9434 {
9435   int r = 0;
9436   Mutex lock("Client::_fsync::lock");
9437   Cond cond;
9438   bool done = false;
9439   C_SafeCond *object_cacher_completion = NULL;
9440   ceph_tid_t flush_tid = 0;
9441   InodeRef tmp_ref;
9442 
9443   ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
9444 
9445   if (cct->_conf->client_oc) {
9446     object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
9447     tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
9448     _flush(in, object_cacher_completion);
9449     ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
9450   }
9451 
9452   if (!syncdataonly && in->dirty_caps) {
9453     check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
9454     if (in->flushing_caps)
9455       flush_tid = last_flush_tid;
9456   } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
9457 
9458   if (!syncdataonly && !in->unsafe_ops.empty()) {
28e407b8 
AA 
    // Make sure the MDS journals our outstanding ops promptly.
9459     flush_mdlog_sync();
9460 
7c673cae 
FG 
    // Waiting for the newest unsafe op implies all older ones are safe too.
9461     MetaRequest *req = in->unsafe_ops.back();
9462     ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;
9463 
9464     req->get();
9465     wait_on_list(req->waitfor_safe);
9466     put_request(req);
9467   }
9468 
9469   if (object_cacher_completion) { // wait on a real reply instead of guessing
9470     client_lock.Unlock();
9471     lock.Lock();
9472     ldout(cct, 15) << "waiting on data to flush" << dendl;
9473     while (!done)
9474       cond.Wait(lock);
9475     lock.Unlock();
9476     client_lock.Lock();
9477     ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
9478   } else {
9479     // FIXME: this can starve
9480     while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
9481       ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
9482 		     << " uncommitted, waiting" << dendl;
9483       wait_on_list(in->waitfor_commit);
9484     }
9485   }
9486 
9487   if (!r) {
9488     if (flush_tid > 0)
9489       wait_sync_caps(in, flush_tid);
9490 
9491     ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
9492   } else {
9493     ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
9494 		  << cpp_strerror(-r) << dendl;
9495   }
9496 
9497   return r;
9498 }
9499
9500int Client::_fsync(Fh *f, bool syncdataonly)
9501{
9502 ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9503 return _fsync(f->inode.get(), syncdataonly);
9504}
9505
9506int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9507{
9508 Mutex::Locker lock(client_lock);
9509 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9510 tout(cct) << fd << std::endl;
9511
181888fb
FG
9512 if (unmounting)
9513 return -ENOTCONN;
9514
7c673cae
FG
9515 Fh *f = get_filehandle(fd);
9516 if (!f)
9517 return -EBADF;
9518 int r = _getattr(f->inode, mask, perms);
9519 if (r < 0)
9520 return r;
9521 fill_stat(f->inode, stbuf, NULL);
9522 ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9523 return r;
9524}
9525
// statx-style metadata fetch for an open file descriptor.
// Unlike fstat(), this may be satisfied entirely from locally-issued
// caps: the MDS round trip is skipped when caps_issued_mask() covers
// every field the caller asked for.
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  // Translate AT_* flags + STATX_* want bits into a CEPH_CAP mask.
  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  // Only go to the MDS if we lack caps for some requested field.
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9555
9556// not written yet, but i want to link!
9557
9558int Client::chdir(const char *relpath, std::string &new_cwd,
9559 const UserPerm& perms)
9560{
9561 Mutex::Locker lock(client_lock);
9562 tout(cct) << "chdir" << std::endl;
9563 tout(cct) << relpath << std::endl;
181888fb
FG
9564
9565 if (unmounting)
9566 return -ENOTCONN;
9567
7c673cae
FG
9568 filepath path(relpath);
9569 InodeRef in;
9570 int r = path_walk(path, &in, perms);
9571 if (r < 0)
9572 return r;
9573 if (cwd != in)
9574 cwd.swap(in);
9575 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9576
b5b8bbf5 9577 _getcwd(new_cwd, perms);
7c673cae
FG
9578 return 0;
9579}
9580
b5b8bbf5 9581void Client::_getcwd(string& dir, const UserPerm& perms)
7c673cae
FG
9582{
9583 filepath path;
9584 ldout(cct, 10) << "getcwd " << *cwd << dendl;
9585
9586 Inode *in = cwd.get();
9587 while (in != root) {
9588 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
9589
9590 // A cwd or ancester is unlinked
9591 if (in->dn_set.empty()) {
9592 return;
9593 }
9594
9595 Dentry *dn = in->get_first_parent();
9596
9597
9598 if (!dn) {
9599 // look it up
9600 ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
9601 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9602 filepath path(in->ino);
9603 req->set_filepath(path);
9604 req->set_inode(in);
9605 int res = make_request(req, perms);
9606 if (res < 0)
9607 break;
9608
9609 // start over
9610 path = filepath();
9611 in = cwd.get();
9612 continue;
9613 }
9614 path.push_front_dentry(dn->name);
9615 in = dn->dir->parent_inode;
9616 }
9617 dir = "/";
9618 dir += path.get_path();
9619}
9620
b5b8bbf5
FG
9621void Client::getcwd(string& dir, const UserPerm& perms)
9622{
9623 Mutex::Locker l(client_lock);
181888fb
FG
9624 if (!unmounting)
9625 _getcwd(dir, perms);
b5b8bbf5
FG
9626}
9627
7c673cae
FG
// statvfs(3)-style filesystem statistics. Fetches cluster-wide stats
// from RADOS, then — if a size quota applies to this mount's root —
// reports the quota instead of the raw cluster numbers.
// Note: `path` is ignored; stats are always for the whole mount.
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;

  if (unmounting)
    return -ENOTCONN;

  ceph_statfs stats;
  C_SaferCond cond;

  // With a single data pool, ask for pool-specific stats; otherwise
  // fall back to cluster-wide stats.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  // Drop client_lock while blocking on the objecter reply to avoid
  // stalling every other client operation.
  client_lock.Unlock();
  int rval = cond.wait();
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks.  We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = stats.num_objects;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9726
// Core file-locking helper behind getlk/setlk/flock: translate the
// POSIX flock request into a CEPH_MDS_OP_{GET,SET}FILELOCK MetaRequest,
// send it, and on success mirror the result into local lock state.
//
// @param lock_type CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
// @param op        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
// @param sleep     nonzero to block waiting for a conflicting lock
// @param removing  true when called from _release_filelocks(); skips
//                  updating the per-Fh lock record
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // Blocking only makes sense for a SETFILELOCK that actually locks.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: hand the request to the interrupt callback so
    // a signal can abort the blocking lock via _interrupt_filelock().
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);

    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);  // drop the extra ref taken for the interrupt cb
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the conflicting lock (if any) back into *fl.
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the MDS-applied change into the inode's lock state
      // (created lazily), and — unless we're tearing the Fh down —
      // into the Fh's lock state as well.
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9837
// Interrupt a blocking file-lock request (invoked via the interrupt
// callback registered in _do_filelock). Marks the original request
// aborted, and if it already reached an MDS, sends a matching
// *_INTR unlock so the MDS drops the pending lock.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Mirror the original lock's rule with its interrupt counterpart.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // Reuse the original requester's identity for the interrupt request.
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
9870
9871void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9872{
9873 if (!in->fcntl_locks && !in->flock_locks)
9874 return;
9875
9876 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9877 ::encode(nr_fcntl_locks, bl);
9878 if (nr_fcntl_locks) {
9879 ceph_lock_state_t* lock_state = in->fcntl_locks;
9880 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9881 p != lock_state->held_locks.end();
9882 ++p)
9883 ::encode(p->second, bl);
9884 }
9885
9886 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9887 ::encode(nr_flock_locks, bl);
9888 if (nr_flock_locks) {
9889 ceph_lock_state_t* lock_state = in->flock_locks;
9890 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9891 p != lock_state->held_locks.end();
9892 ++p)
9893 ::encode(p->second, bl);
9894 }
9895
9896 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
9897 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
9898}
9899
// Release every lock held through this Fh when it is closed: collect
// the held locks, free the per-Fh lock state, then send an UNLCK to
// the MDS for each (with removing=true so _do_filelock skips the
// now-deleted Fh state).
// NOTE(review): fh->fcntl_locks/flock_locks are deleted but not
// nulled — presumably safe because the Fh is being torn down; confirm
// against callers.
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    ceph_lock_state_t* lock_state = fh->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
    delete fh->fcntl_locks;
  }
  if (fh->flock_locks) {
    ceph_lock_state_t* lock_state = fh->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
    delete fh->flock_locks;
  }

  if (to_release.empty())
    return;

  // One UNLCK request per previously-held lock range.
  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
9945
9946void Client::_update_lock_state(struct flock *fl, uint64_t owner,
9947 ceph_lock_state_t *lock_state)
9948{
9949 int lock_cmd;
9950 if (F_RDLCK == fl->l_type)
9951 lock_cmd = CEPH_LOCK_SHARED;
9952 else if (F_WRLCK == fl->l_type)
9953 lock_cmd = CEPH_LOCK_EXCL;
9954 else
9955 lock_cmd = CEPH_LOCK_UNLOCK;;
9956
9957 ceph_filelock filelock;
9958 filelock.start = fl->l_start;
9959 filelock.length = fl->l_len;
9960 filelock.client = 0;
9961 // see comment in _do_filelock()
9962 filelock.owner = owner | (1ULL << 63);
9963 filelock.pid = fl->l_pid;
9964 filelock.type = lock_cmd;
9965
9966 if (filelock.type == CEPH_LOCK_UNLOCK) {
9967 list<ceph_filelock> activated_locks;
9968 lock_state->remove_lock(filelock, activated_locks);
9969 } else {
9970 bool r = lock_state->add_lock(filelock, false, false, NULL);
9971 assert(r);
9972 }
9973}
9974
9975int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
9976{
9977 Inode *in = fh->inode.get();
9978 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
9979 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
9980 return ret;
9981}
9982
9983int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
9984{
9985 Inode *in = fh->inode.get();
9986 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
9987 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
9988 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
9989 return ret;
9990}
9991
9992int Client::_flock(Fh *fh, int cmd, uint64_t owner)
9993{
9994 Inode *in = fh->inode.get();
9995 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
9996
9997 int sleep = !(cmd & LOCK_NB);
9998 cmd &= ~LOCK_NB;
9999
10000 int type;
10001 switch (cmd) {
10002 case LOCK_SH:
10003 type = F_RDLCK;
10004 break;
10005 case LOCK_EX:
10006 type = F_WRLCK;
10007 break;
10008 case LOCK_UN:
10009 type = F_UNLCK;
10010 break;
10011 default:
10012 return -EINVAL;
10013 }
10014
10015 struct flock fl;
10016 memset(&fl, 0, sizeof(fl));
10017 fl.l_type = type;
10018 fl.l_whence = SEEK_SET;
10019
10020 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10021 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10022 return ret;
10023}
10024
10025int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10026{
10027 /* Since the only thing this does is wrap a call to statfs, and
10028 statfs takes a lock, it doesn't seem we have a need to split it
10029 out. */
10030 return statfs(0, stbuf, perms);
10031}
10032
// Register libcephfs/FUSE callbacks (cache invalidation, lock
// interruption, remount, umask). For each callback that needs async
// delivery, the matching finisher thread is started here.
// A null args pointer is a no-op.
void Client::ll_register_callbacks(struct client_callback_args *args)
{
  if (!args)
    return;
  Mutex::Locker l(client_lock);
  ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  // umask_cb needs no finisher; it is stored unconditionally.
  umask_cb = args->umask_cb;
}
10063
// Verify we have some way to invalidate kernel dentry cache entries:
// either a registered dentry-invalidate callback or a working remount.
// If neither works, optionally abort (per
// client_die_on_failed_dentry_invalidate) since stale dentries can
// otherwise be served forever.
// Returns 0 on success, or the error from _do_remount().
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount();
  }
  if (r) {
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort) {
      lderr(cct) << "no method to invalidate kernel dentry cache; quitting!" << dendl;
      ceph_abort();
    } else {
      lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    }
  }
  return r;
}
10089
// Flush all dirty data and metadata for the whole mount: object cache
// writeback, cap flushes, and unsafe MDS requests. Caller must hold
// client_lock. client_lock is dropped while waiting on the data flush
// so writeback can make progress.
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;  // no object cache => nothing buffered locally

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    // Drop client_lock before taking the local flush lock to preserve
    // lock ordering and let the flush callback run.
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
10124
10125int Client::sync_fs()
10126{
10127 Mutex::Locker l(client_lock);
181888fb
FG
10128
10129 if (unmounting)
10130 return -ENOTCONN;
10131
7c673cae
FG
10132 return _sync_fs();
10133}
10134
10135int64_t Client::drop_caches()
10136{
10137 Mutex::Locker l(client_lock);
10138 return objectcacher->release_all();
10139}
10140
10141
10142int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10143{
10144 Mutex::Locker l(client_lock);
10145 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10146 << ", " << offset << ", " << count << ")" << dendl;
10147
10148 Fh *f = get_filehandle(fd);
10149 if (!f)
10150 return -EBADF;
10151
10152 // for now
10153 _fsync(f, true);
10154
10155 return 0;
10156}
10157
10158int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10159{
10160 Mutex::Locker l(client_lock);
10161 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10162 << ", " << offset << ", " << count << ")" << dendl;
10163
10164 Fh *f = get_filehandle(fd);
10165 if (!f)
10166 return -EBADF;
10167 Inode *in = f->inode.get();
10168
10169 _fsync(f, true);
10170 if (_release(in))
10171 check_caps(in, 0);
10172 return 0;
10173}
10174
10175
10176// =============================
10177// snaps
10178
10179int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10180{
10181 Mutex::Locker l(client_lock);
181888fb
FG
10182
10183 if (unmounting)
10184 return -ENOTCONN;
10185
7c673cae
FG
10186 filepath path(relpath);
10187 InodeRef in;
10188 int r = path_walk(path, &in, perm);
10189 if (r < 0)
10190 return r;
10191 if (cct->_conf->client_permissions) {
10192 r = may_create(in.get(), perm);
10193 if (r < 0)
10194 return r;
10195 }
10196 Inode *snapdir = open_snapdir(in.get());
10197 return _mkdir(snapdir, name, 0, perm);
10198}
181888fb 10199
7c673cae
FG
10200int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10201{
10202 Mutex::Locker l(client_lock);
181888fb
FG
10203
10204 if (unmounting)
10205 return -ENOTCONN;
10206
7c673cae
FG
10207 filepath path(relpath);
10208 InodeRef in;
10209 int r = path_walk(path, &in, perms);
10210 if (r < 0)
10211 return r;
10212 if (cct->_conf->client_permissions) {
10213 r = may_delete(in.get(), NULL, perms);
10214 if (r < 0)
10215 return r;
10216 }
10217 Inode *snapdir = open_snapdir(in.get());
10218 return _rmdir(snapdir, name, perms);
10219}
10220
10221// =============================
10222// expose caps
10223
10224int Client::get_caps_issued(int fd) {
10225
10226 Mutex::Locker lock(client_lock);
10227
181888fb
FG
10228 if (unmounting)
10229 return -ENOTCONN;
10230
7c673cae
FG
10231 Fh *f = get_filehandle(fd);
10232 if (!f)
10233 return -EBADF;
10234
10235 return f->inode->caps_issued();
10236}
10237
10238int Client::get_caps_issued(const char *path, const UserPerm& perms)
10239{
10240 Mutex::Locker lock(client_lock);
181888fb
FG
10241
10242 if (unmounting)
10243 return -ENOTCONN;
10244
7c673cae
FG
10245 filepath p(path);
10246 InodeRef in;
10247 int r = path_walk(p, &in, perms, true);
10248 if (r < 0)
10249 return r;
10250 return in->caps_issued();
10251}
10252
10253// =========================================
10254// low level
10255
// Return the virtual ".snap" directory inode for diri, creating and
// caching it (keyed by (ino, CEPH_SNAPDIR)) on first use. The snapdir
// mirrors most of the parent directory's attributes.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10287
// Low-level lookup of `name` under `parent`. On success fills *attr,
// takes an ll reference on the result (caller must ll_forget/ll_put),
// and stores it in *out. On failure attr->st_ino is zeroed and *out
// is set to NULL (in.get() on an empty ref).
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // Unless FUSE enforces permissions itself, check exec/search access
  // on the parent here.
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // pin for the caller

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10327
// statx variant of ll_lookup(): only fetches the cap mask implied by
// (flags, want). On success fills *stx and takes an ll reference on
// the result; on failure zeroes stx ino/mask. *out is always set.
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // Unless FUSE enforces permissions itself, check exec/search access
  // on the parent here.
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10368
// Walk a full path `name` and return the inode with statx attributes.
// AT_SYMLINK_NOFOLLOW in flags suppresses following a trailing
// symlink. On success takes an ll reference on the result; on failure
// zeroes stx ino/mask and sets *out to NULL.
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << "ll_walk" << name << dendl;
  tout(cct) << "ll_walk" << std::endl;
  tout(cct) << name << std::endl;

  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller
    *out = in.get();
    return 0;
  }
}
10401
// Take an ll (low-level API) reference on an inode. On the 0 -> 1
// transition also takes an internal inode ref, and for a linked
// directory pins its (single) parent dentry so the path stays cached.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10414
// Drop `num` ll references from an inode; undoes _ll_get's pinning on
// the 1 -> 0 transition (unpins the parent dentry for directories and
// releases the internal inode ref, which may free the inode).
// Returns the remaining ll_ref count (0 when fully released).
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10430
// Drop every outstanding ll reference on every cached inode (used at
// unmount). The `next` iterator is captured before _ll_put() because
// releasing the last ref can erase the inode from inode_map,
// invalidating the current iterator.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << "_ll_drop_pins" << dendl;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref)
      _ll_put(in, in->ll_ref);
  }
}
10445
// FUSE "forget": drop `count` ll references from an inode. Returns
// true when the inode no longer has ll references (or when the call
// is ignored: after unmount, or for the root inode).
// Over-forgetting is tolerated with a warning: the refcount is
// clamped to zero rather than going negative.
bool Client::ll_forget(Inode *in, int count)
{
  Mutex::Locker lock(client_lock);
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10475
10476bool Client::ll_put(Inode *in)
10477{
10478 /* ll_forget already takes the lock */
10479 return ll_forget(in, 1);
10480}
10481
10482snapid_t Client::ll_get_snapid(Inode *in)
10483{
10484 Mutex::Locker lock(client_lock);
10485 return in->snapid;
10486}
10487
10488Inode *Client::ll_get_inode(ino_t ino)
10489{
10490 Mutex::Locker lock(client_lock);
181888fb
FG
10491
10492 if (unmounting)
10493 return NULL;
10494
7c673cae
FG
10495 vinodeno_t vino = _map_faked_ino(ino);
10496 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10497 if (p == inode_map.end())
10498 return NULL;
10499 Inode *in = p->second;
10500 _ll_get(in);
10501 return in;
10502}
10503
10504Inode *Client::ll_get_inode(vinodeno_t vino)
10505{
10506 Mutex::Locker lock(client_lock);
181888fb
FG
10507
10508 if (unmounting)
10509 return NULL;
10510
7c673cae
FG
10511 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10512 if (p == inode_map.end())
10513 return NULL;
10514 Inode *in = p->second;
10515 _ll_get(in);
10516 return in;
10517}
10518
10519int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10520{
10521 vinodeno_t vino = _get_vino(in);
10522
10523 ldout(cct, 3) << "ll_getattr " << vino << dendl;
10524 tout(cct) << "ll_getattr" << std::endl;
10525 tout(cct) << vino.ino.val << std::endl;
10526
10527 if (vino.snapid < CEPH_NOSNAP)
10528 return 0;
10529 else
10530 return _getattr(in, caps, perms);
10531}
10532
10533int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10534{
10535 Mutex::Locker lock(client_lock);
10536
181888fb
FG
10537 if (unmounting)
10538 return -ENOTCONN;
10539
7c673cae
FG
10540 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10541
10542 if (res == 0)
10543 fill_stat(in, attr);
10544 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10545 return res;
10546}
10547
// statx variant of ll_getattr(): skips the MDS round trip entirely
// when locally-issued caps already cover the requested mask.
int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
			unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  int res = 0;
  unsigned mask = statx_to_mask(flags, want);

  // Only fetch from the MDS if we lack caps for some requested field.
  if (mask && !in->caps_issued_mask(mask, true))
    res = _ll_getattr(in, mask, perms);

  if (res == 0)
    fill_statx(in, mask, stx);
  ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
  return res;
}
10567
// Internal setattr helper shared by ll_setattr/ll_setattrx: logs the
// request, optionally enforces client-side permission checks, strips
// the *_NOW convenience bits (already folded into stx by the caller
// path), and delegates to __setattrx(). *inp receives the updated
// inode reference.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << "ll_setattrx" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  // Unless FUSE enforces permissions itself, verify the caller may
  // perform this attribute change.
  if (!cct->_conf->fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
10596
10597int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10598 const UserPerm& perms)
10599{
10600 Mutex::Locker lock(client_lock);
181888fb
FG
10601
10602 if (unmounting)
10603 return -ENOTCONN;
10604
7c673cae
FG
10605 InodeRef target(in);
10606 int res = _ll_setattrx(in, stx, mask, perms, &target);
10607 if (res == 0) {
10608 assert(in == target.get());
10609 fill_statx(in, in->caps_issued(), stx);
10610 }
10611
10612 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10613 return res;
10614}
10615
10616int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10617 const UserPerm& perms)
10618{
10619 struct ceph_statx stx;
10620 stat_to_statx(attr, &stx);
10621
10622 Mutex::Locker lock(client_lock);
181888fb
FG
10623
10624 if (unmounting)
10625 return -ENOTCONN;
10626
7c673cae
FG
10627 InodeRef target(in);
10628 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10629 if (res == 0) {
10630 assert(in == target.get());
10631 fill_stat(in, attr);
10632 }
10633
10634 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10635 return res;
10636}
10637
10638
10639// ----------
10640// xattrs
10641
10642int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10643 const UserPerm& perms)
10644{
10645 Mutex::Locker lock(client_lock);
181888fb
FG
10646
10647 if (unmounting)
10648 return -ENOTCONN;
10649
7c673cae
FG
10650 InodeRef in;
10651 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10652 if (r < 0)
10653 return r;
10654 return _getxattr(in, name, value, size, perms);
10655}
10656
10657int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10658 const UserPerm& perms)
10659{
10660 Mutex::Locker lock(client_lock);
181888fb
FG
10661
10662 if (unmounting)
10663 return -ENOTCONN;
10664
7c673cae
FG
10665 InodeRef in;
10666 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10667 if (r < 0)
10668 return r;
10669 return _getxattr(in, name, value, size, perms);
10670}
10671
10672int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10673 const UserPerm& perms)
10674{
10675 Mutex::Locker lock(client_lock);
181888fb
FG
10676
10677 if (unmounting)
10678 return -ENOTCONN;
10679
7c673cae
FG
10680 Fh *f = get_filehandle(fd);
10681 if (!f)
10682 return -EBADF;
10683 return _getxattr(f->inode, name, value, size, perms);
10684}
10685
10686int Client::listxattr(const char *path, char *list, size_t size,
10687 const UserPerm& perms)
10688{
10689 Mutex::Locker lock(client_lock);
181888fb
FG
10690
10691 if (unmounting)
10692 return -ENOTCONN;
10693
7c673cae
FG
10694 InodeRef in;
10695 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10696 if (r < 0)
10697 return r;
10698 return Client::_listxattr(in.get(), list, size, perms);
10699}
10700
10701int Client::llistxattr(const char *path, char *list, size_t size,
10702 const UserPerm& perms)
10703{
10704 Mutex::Locker lock(client_lock);
181888fb
FG
10705
10706 if (unmounting)
10707 return -ENOTCONN;
10708
7c673cae
FG
10709 InodeRef in;
10710 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10711 if (r < 0)
10712 return r;
10713 return Client::_listxattr(in.get(), list, size, perms);
10714}
10715
10716int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10717{
10718 Mutex::Locker lock(client_lock);
181888fb
FG
10719
10720 if (unmounting)
10721 return -ENOTCONN;
10722
7c673cae
FG
10723 Fh *f = get_filehandle(fd);
10724 if (!f)
10725 return -EBADF;
10726 return Client::_listxattr(f->inode.get(), list, size, perms);
10727}
10728
10729int Client::removexattr(const char *path, const char *name,
10730 const UserPerm& perms)
10731{
10732 Mutex::Locker lock(client_lock);
181888fb
FG
10733
10734 if (unmounting)
10735 return -ENOTCONN;
10736
7c673cae
FG
10737 InodeRef in;
10738 int r = Client::path_walk(path, &in, perms, true);
10739 if (r < 0)
10740 return r;
10741 return _removexattr(in, name, perms);
10742}
10743
10744int Client::lremovexattr(const char *path, const char *name,
10745 const UserPerm& perms)
10746{
10747 Mutex::Locker lock(client_lock);
181888fb
FG
10748
10749 if (unmounting)
10750 return -ENOTCONN;
10751
7c673cae
FG
10752 InodeRef in;
10753 int r = Client::path_walk(path, &in, perms, false);
10754 if (r < 0)
10755 return r;
10756 return _removexattr(in, name, perms);
10757}
10758
10759int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10760{
10761 Mutex::Locker lock(client_lock);
181888fb
FG
10762
10763 if (unmounting)
10764 return -ENOTCONN;
10765
7c673cae
FG
10766 Fh *f = get_filehandle(fd);
10767 if (!f)
10768 return -EBADF;
10769 return _removexattr(f->inode, name, perms);
10770}
10771
10772int Client::setxattr(const char *path, const char *name, const void *value,
10773 size_t size, int flags, const UserPerm& perms)
10774{
10775 _setxattr_maybe_wait_for_osdmap(name, value, size);
10776
10777 Mutex::Locker lock(client_lock);
181888fb
FG
10778
10779 if (unmounting)
10780 return -ENOTCONN;
10781
7c673cae
FG
10782 InodeRef in;
10783 int r = Client::path_walk(path, &in, perms, true);
10784 if (r < 0)
10785 return r;
10786 return _setxattr(in, name, value, size, flags, perms);
10787}
10788
10789int Client::lsetxattr(const char *path, const char *name, const void *value,
10790 size_t size, int flags, const UserPerm& perms)
10791{
10792 _setxattr_maybe_wait_for_osdmap(name, value, size);
10793
10794 Mutex::Locker lock(client_lock);
181888fb
FG
10795
10796 if (unmounting)
10797 return -ENOTCONN;
10798
7c673cae
FG
10799 InodeRef in;
10800 int r = Client::path_walk(path, &in, perms, false);
10801 if (r < 0)
10802 return r;
10803 return _setxattr(in, name, value, size, flags, perms);
10804}
10805
10806int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10807 int flags, const UserPerm& perms)
10808{
10809 _setxattr_maybe_wait_for_osdmap(name, value, size);
10810
10811 Mutex::Locker lock(client_lock);
181888fb
FG
10812
10813 if (unmounting)
10814 return -ENOTCONN;
10815
7c673cae
FG
10816 Fh *f = get_filehandle(fd);
10817 if (!f)
10818 return -EBADF;
10819 return _setxattr(f->inode, name, value, size, flags, perms);
10820}
10821
10822int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10823 const UserPerm& perms)
10824{
10825 int r;
10826
10827 const VXattr *vxattr = _match_vxattr(in, name);
10828 if (vxattr) {
10829 r = -ENODATA;
10830
10831 // Do a force getattr to get the latest quota before returning
10832 // a value to userspace.
28e407b8
AA
10833 int flags = 0;
10834 if (vxattr->flags & VXATTR_RSTAT) {
10835 flags |= CEPH_STAT_RSTAT;
10836 }
10837 r = _getattr(in, flags, perms, true);
7c673cae
FG
10838 if (r != 0) {
10839 // Error from getattr!
10840 return r;
10841 }
10842
10843 // call pointer-to-member function
10844 char buf[256];
10845 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10846 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10847 } else {
10848 r = -ENODATA;
10849 }
10850
10851 if (size != 0) {
10852 if (r > (int)size) {
10853 r = -ERANGE;
10854 } else if (r > 0) {
10855 memcpy(value, buf, r);
10856 }
10857 }
10858 goto out;
10859 }
10860
10861 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10862 r = -EOPNOTSUPP;
10863 goto out;
10864 }
10865
10866 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10867 if (r == 0) {
10868 string n(name);
10869 r = -ENODATA;
10870 if (in->xattrs.count(n)) {
10871 r = in->xattrs[n].length();
10872 if (r > 0 && size != 0) {
10873 if (size >= (unsigned)r)
10874 memcpy(value, in->xattrs[n].c_str(), r);
10875 else
10876 r = -ERANGE;
10877 }
10878 }
10879 }
10880 out:
10881 ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
10882 return r;
10883}
10884
10885int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
10886 const UserPerm& perms)
10887{
10888 if (cct->_conf->client_permissions) {
10889 int r = xattr_permission(in.get(), name, MAY_READ, perms);
10890 if (r < 0)
10891 return r;
10892 }
10893 return _getxattr(in.get(), name, value, size, perms);
10894}
10895
10896int Client::ll_getxattr(Inode *in, const char *name, void *value,
10897 size_t size, const UserPerm& perms)
10898{
10899 Mutex::Locker lock(client_lock);
10900
181888fb
FG
10901 if (unmounting)
10902 return -ENOTCONN;
10903
7c673cae
FG
10904 vinodeno_t vino = _get_vino(in);
10905
10906 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
10907 tout(cct) << "ll_getxattr" << std::endl;
10908 tout(cct) << vino.ino.val << std::endl;
10909 tout(cct) << name << std::endl;
10910
10911 if (!cct->_conf->fuse_default_permissions) {
10912 int r = xattr_permission(in, name, MAY_READ, perms);
10913 if (r < 0)
10914 return r;
10915 }
10916
10917 return _getxattr(in, name, value, size, perms);
10918}
10919
10920int Client::_listxattr(Inode *in, char *name, size_t size,
10921 const UserPerm& perms)
10922{
10923 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10924 if (r == 0) {
10925 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
10926 p != in->xattrs.end();
10927 ++p)
10928 r += p->first.length() + 1;
10929
10930 const VXattr *vxattrs = _get_vxattrs(in);
10931 r += _vxattrs_name_size(vxattrs);
10932
10933 if (size != 0) {
10934 if (size >= (unsigned)r) {
10935 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
10936 p != in->xattrs.end();
10937 ++p) {
10938 memcpy(name, p->first.c_str(), p->first.length());
10939 name += p->first.length();
10940 *name = '\0';
10941 name++;
10942 }
10943 if (vxattrs) {
10944 for (int i = 0; !vxattrs[i].name.empty(); i++) {
10945 const VXattr& vxattr = vxattrs[i];
10946 if (vxattr.hidden)
10947 continue;
10948 // call pointer-to-member function
10949 if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
10950 continue;
10951 memcpy(name, vxattr.name.c_str(), vxattr.name.length());
10952 name += vxattr.name.length();
10953 *name = '\0';
10954 name++;
10955 }
10956 }
10957 } else
10958 r = -ERANGE;
10959 }
10960 }
10961 ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
10962 return r;
10963}
10964
10965int Client::ll_listxattr(Inode *in, char *names, size_t size,
10966 const UserPerm& perms)
10967{
10968 Mutex::Locker lock(client_lock);
10969
181888fb
FG
10970 if (unmounting)
10971 return -ENOTCONN;
10972
7c673cae
FG
10973 vinodeno_t vino = _get_vino(in);
10974
10975 ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
10976 tout(cct) << "ll_listxattr" << std::endl;
10977 tout(cct) << vino.ino.val << std::endl;
10978 tout(cct) << size << std::endl;
10979
10980 return _listxattr(in, names, size, perms);
10981}
10982
10983int Client::_do_setxattr(Inode *in, const char *name, const void *value,
10984 size_t size, int flags, const UserPerm& perms)
10985{
10986
10987 int xattr_flags = 0;
10988 if (!value)
10989 xattr_flags |= CEPH_XATTR_REMOVE;
10990 if (flags & XATTR_CREATE)
10991 xattr_flags |= CEPH_XATTR_CREATE;
10992 if (flags & XATTR_REPLACE)
10993 xattr_flags |= CEPH_XATTR_REPLACE;
10994
10995 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
10996 filepath path;
10997 in->make_nosnap_relative_path(path);
10998 req->set_filepath(path);
10999 req->set_string2(name);
11000 req->set_inode(in);
11001 req->head.args.setxattr.flags = xattr_flags;
11002
11003 bufferlist bl;
11004 bl.append((const char*)value, size);
11005 req->set_data(bl);
11006
11007 int res = make_request(req, perms);
11008
11009 trim_cache();
11010 ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
11011 res << dendl;
11012 return res;
11013}
11014
11015int Client::_setxattr(Inode *in, const char *name, const void *value,
11016 size_t size, int flags, const UserPerm& perms)
11017{
11018 if (in->snapid != CEPH_NOSNAP) {
11019 return -EROFS;
11020 }
11021
11022 bool posix_acl_xattr = false;
11023 if (acl_type == POSIX_ACL)
11024 posix_acl_xattr = !strncmp(name, "system.", 7);
11025
11026 if (strncmp(name, "user.", 5) &&
11027 strncmp(name, "security.", 9) &&
11028 strncmp(name, "trusted.", 8) &&
11029 strncmp(name, "ceph.", 5) &&
11030 !posix_acl_xattr)
11031 return -EOPNOTSUPP;
11032
11033 if (posix_acl_xattr) {
11034 if (!strcmp(name, ACL_EA_ACCESS)) {
11035 mode_t new_mode = in->mode;
11036 if (value) {
11037 int ret = posix_acl_equiv_mode(value, size, &new_mode);
11038 if (ret < 0)
11039 return ret;
11040 if (ret == 0) {
11041 value = NULL;
11042 size = 0;
11043 }
11044 if (new_mode != in->mode) {
11045 struct ceph_statx stx;
11046 stx.stx_mode = new_mode;
11047 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
11048 if (ret < 0)
11049 return ret;
11050 }
11051 }
11052 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
11053 if (value) {
11054 if (!S_ISDIR(in->mode))
11055 return -EACCES;
11056 int ret = posix_acl_check(value, size);
11057 if (ret < 0)
11058 return -EINVAL;
11059 if (ret == 0) {
11060 value = NULL;
11061 size = 0;
11062 }
11063 }
11064 } else {
11065 return -EOPNOTSUPP;
11066 }
11067 } else {
11068 const VXattr *vxattr = _match_vxattr(in, name);
11069 if (vxattr && vxattr->readonly)
11070 return -EOPNOTSUPP;
11071 }
11072
11073 return _do_setxattr(in, name, value, size, flags, perms);
11074}
11075
11076int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11077 size_t size, int flags, const UserPerm& perms)
11078{
11079 if (cct->_conf->client_permissions) {
11080 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11081 if (r < 0)
11082 return r;
11083 }
11084 return _setxattr(in.get(), name, value, size, flags, perms);
11085}
11086
11087int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11088{
11089 string tmp;
11090 if (name == "layout") {
11091 string::iterator begin = value.begin();
11092 string::iterator end = value.end();
11093 keys_and_values<string::iterator> p; // create instance of parser
11094 std::map<string, string> m; // map to receive results
11095 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11096 return -EINVAL;
11097 }
11098 if (begin != end)
11099 return -EINVAL;
11100 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11101 if (q->first == "pool") {
11102 tmp = q->second;
11103 break;
11104 }
11105 }
11106 } else if (name == "layout.pool") {
11107 tmp = value;
11108 }
11109
11110 if (tmp.length()) {
11111 int64_t pool;
11112 try {
11113 pool = boost::lexical_cast<unsigned>(tmp);
11114 if (!osdmap->have_pg_pool(pool))
11115 return -ENOENT;
11116 } catch (boost::bad_lexical_cast const&) {
11117 pool = osdmap->lookup_pg_pool_name(tmp);
11118 if (pool < 0) {
11119 return -ENOENT;
11120 }
11121 }
11122 }
11123
11124 return 0;
11125}
11126
11127void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11128{
11129 // For setting pool of layout, MetaRequest need osdmap epoch.
11130 // There is a race which create a new data pool but client and mds both don't have.
11131 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11132 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11133 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11134 string rest(strstr(name, "layout"));
11135 string v((const char*)value, size);
11136 int r = objecter->with_osdmap([&](const OSDMap& o) {
11137 return _setxattr_check_data_pool(rest, v, &o);
11138 });
11139
11140 if (r == -ENOENT) {
11141 C_SaferCond ctx;
11142 objecter->wait_for_latest_osdmap(&ctx);
11143 ctx.wait();
11144 }
11145 }
11146}
11147
11148int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11149 size_t size, int flags, const UserPerm& perms)
11150{
11151 _setxattr_maybe_wait_for_osdmap(name, value, size);
11152
11153 Mutex::Locker lock(client_lock);
11154
181888fb
FG
11155 if (unmounting)
11156 return -ENOTCONN;
11157
7c673cae
FG
11158 vinodeno_t vino = _get_vino(in);
11159
11160 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
11161 tout(cct) << "ll_setxattr" << std::endl;
11162 tout(cct) << vino.ino.val << std::endl;
11163 tout(cct) << name << std::endl;
11164
11165 if (!cct->_conf->fuse_default_permissions) {
11166 int r = xattr_permission(in, name, MAY_WRITE, perms);
11167 if (r < 0)
11168 return r;
11169 }
11170 return _setxattr(in, name, value, size, flags, perms);
11171}
11172
11173int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11174{
11175 if (in->snapid != CEPH_NOSNAP) {
11176 return -EROFS;
11177 }
11178
11179 // same xattrs supported by kernel client
11180 if (strncmp(name, "user.", 5) &&
11181 strncmp(name, "system.", 7) &&
11182 strncmp(name, "security.", 9) &&
11183 strncmp(name, "trusted.", 8) &&
11184 strncmp(name, "ceph.", 5))
11185 return -EOPNOTSUPP;
11186
11187 const VXattr *vxattr = _match_vxattr(in, name);
11188 if (vxattr && vxattr->readonly)
11189 return -EOPNOTSUPP;
11190
11191 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11192 filepath path;
11193 in->make_nosnap_relative_path(path);
11194 req->set_filepath(path);
11195 req->set_filepath2(name);
11196 req->set_inode(in);
11197
11198 int res = make_request(req, perms);
11199
11200 trim_cache();
11201 ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11202 return res;
11203}
11204
11205int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11206{
11207 if (cct->_conf->client_permissions) {
11208 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11209 if (r < 0)
11210 return r;
11211 }
11212 return _removexattr(in.get(), name, perms);
11213}
11214
11215int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11216{
11217 Mutex::Locker lock(client_lock);
11218
181888fb
FG
11219 if (unmounting)
11220 return -ENOTCONN;
11221
7c673cae
FG
11222 vinodeno_t vino = _get_vino(in);
11223
11224 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11225 tout(cct) << "ll_removexattr" << std::endl;
11226 tout(cct) << vino.ino.val << std::endl;
11227 tout(cct) << name << std::endl;
11228
11229 if (!cct->_conf->fuse_default_permissions) {
11230 int r = xattr_permission(in, name, MAY_WRITE, perms);
11231 if (r < 0)
11232 return r;
11233 }
11234
11235 return _removexattr(in, name, perms);
11236}
11237
11238bool Client::_vxattrcb_quota_exists(Inode *in)
11239{
11240 return in->quota.is_enable();
11241}
11242size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11243{
11244 return snprintf(val, size,
11245 "max_bytes=%lld max_files=%lld",
11246 (long long int)in->quota.max_bytes,
11247 (long long int)in->quota.max_files);
11248}
11249size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11250{
11251 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11252}
11253size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11254{
11255 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11256}
11257
11258bool Client::_vxattrcb_layout_exists(Inode *in)
11259{
11260 return in->layout != file_layout_t();
11261}
11262size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11263{
11264 int r = snprintf(val, size,
11265 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
11266 (unsigned long long)in->layout.stripe_unit,
11267 (unsigned long long)in->layout.stripe_count,
11268 (unsigned long long)in->layout.object_size);
11269 objecter->with_osdmap([&](const OSDMap& o) {
11270 if (o.have_pg_pool(in->layout.pool_id))
11271 r += snprintf(val + r, size - r, "%s",
11272 o.get_pool_name(in->layout.pool_id).c_str());
11273 else
11274 r += snprintf(val + r, size - r, "%" PRIu64,
11275 (uint64_t)in->layout.pool_id);
11276 });
11277 if (in->layout.pool_ns.length())
11278 r += snprintf(val + r, size - r, " pool_namespace=%s",
11279 in->layout.pool_ns.c_str());
11280 return r;
11281}
11282size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11283{
11284 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
11285}
11286size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11287{
11288 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
11289}
11290size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11291{
11292 return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
11293}
11294size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11295{
11296 size_t r;
11297 objecter->with_osdmap([&](const OSDMap& o) {
11298 if (o.have_pg_pool(in->layout.pool_id))
11299 r = snprintf(val, size, "%s", o.get_pool_name(
11300 in->layout.pool_id).c_str());
11301 else
11302 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11303 });
11304 return r;
11305}
11306size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11307{
11308 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11309}
11310size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11311{
11312 return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11313}
11314size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11315{
11316 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
11317}
11318size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11319{
11320 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
11321}
11322size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11323{
11324 return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11325}
11326size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11327{
11328 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
11329}
11330size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11331{
11332 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
11333}
11334size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11335{
11336 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
11337}
11338size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11339{
11340 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11341 (long)in->rstat.rctime.nsec());
11342}
11343
11344#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11345#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11346
11347#define XATTR_NAME_CEPH(_type, _name) \
11348{ \
11349 name: CEPH_XATTR_NAME(_type, _name), \
11350 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11351 readonly: true, \
11352 hidden: false, \
11353 exists_cb: NULL, \
28e407b8
AA
11354 flags: 0, \
11355}
11356#define XATTR_NAME_CEPH2(_type, _name, _flags) \
11357{ \
11358 name: CEPH_XATTR_NAME(_type, _name), \
11359 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11360 readonly: true, \
11361 hidden: false, \
11362 exists_cb: NULL, \
11363 flags: _flags, \
7c673cae
FG
11364}
11365#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11366{ \
11367 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11368 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11369 readonly: false, \
11370 hidden: true, \
11371 exists_cb: &Client::_vxattrcb_layout_exists, \
28e407b8 11372 flags: 0, \
7c673cae
FG
11373}
11374#define XATTR_QUOTA_FIELD(_type, _name) \
11375{ \
11376 name: CEPH_XATTR_NAME(_type, _name), \
11377 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11378 readonly: false, \
11379 hidden: true, \
11380 exists_cb: &Client::_vxattrcb_quota_exists, \
28e407b8 11381 flags: 0, \
7c673cae
FG
11382}
11383
11384const Client::VXattr Client::_dir_vxattrs[] = {
11385 {
11386 name: "ceph.dir.layout",
11387 getxattr_cb: &Client::_vxattrcb_layout,
11388 readonly: false,
11389 hidden: true,
11390 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 11391 flags: 0,
7c673cae
FG
11392 },
11393 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11394 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11395 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11396 XATTR_LAYOUT_FIELD(dir, layout, pool),
11397 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11398 XATTR_NAME_CEPH(dir, entries),
11399 XATTR_NAME_CEPH(dir, files),
11400 XATTR_NAME_CEPH(dir, subdirs),
28e407b8
AA
11401 XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
11402 XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
11403 XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
11404 XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
11405 XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
7c673cae
FG
11406 {
11407 name: "ceph.quota",
11408 getxattr_cb: &Client::_vxattrcb_quota,
11409 readonly: false,
11410 hidden: true,
11411 exists_cb: &Client::_vxattrcb_quota_exists,
28e407b8 11412 flags: 0,
7c673cae
FG
11413 },
11414 XATTR_QUOTA_FIELD(quota, max_bytes),
11415 XATTR_QUOTA_FIELD(quota, max_files),
11416 { name: "" } /* Required table terminator */
11417};
11418
11419const Client::VXattr Client::_file_vxattrs[] = {
11420 {
11421 name: "ceph.file.layout",
11422 getxattr_cb: &Client::_vxattrcb_layout,
11423 readonly: false,
11424 hidden: true,
11425 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 11426 flags: 0,
7c673cae
FG
11427 },
11428 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
11429 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
11430 XATTR_LAYOUT_FIELD(file, layout, object_size),
11431 XATTR_LAYOUT_FIELD(file, layout, pool),
11432 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
11433 { name: "" } /* Required table terminator */
11434};
11435
11436const Client::VXattr *Client::_get_vxattrs(Inode *in)
11437{
11438 if (in->is_dir())
11439 return _dir_vxattrs;
11440 else if (in->is_file())
11441 return _file_vxattrs;
11442 return NULL;
11443}
11444
11445const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11446{
11447 if (strncmp(name, "ceph.", 5) == 0) {
11448 const VXattr *vxattr = _get_vxattrs(in);
11449 if (vxattr) {
11450 while (!vxattr->name.empty()) {
11451 if (vxattr->name == name)
11452 return vxattr;
11453 vxattr++;
11454 }
11455 }
11456 }
11457 return NULL;
11458}
11459
11460size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11461{
11462 size_t len = 0;
11463 while (!vxattr->name.empty()) {
11464 if (!vxattr->hidden)
11465 len += vxattr->name.length() + 1;
11466 vxattr++;
11467 }
11468 return len;
11469}
11470
11471int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11472{
11473 Mutex::Locker lock(client_lock);
11474
181888fb
FG
11475 if (unmounting)
11476 return -ENOTCONN;
11477
7c673cae
FG
11478 vinodeno_t vino = _get_vino(in);
11479
11480 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11481 tout(cct) << "ll_readlink" << std::endl;
11482 tout(cct) << vino.ino.val << std::endl;
11483
11484 set<Dentry*>::iterator dn = in->dn_set.begin();
11485 while (dn != in->dn_set.end()) {
11486 touch_dn(*dn);
11487 ++dn;
11488 }
11489
11490 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11491 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11492 return r;
11493}
11494
11495int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
11496 const UserPerm& perms, InodeRef *inp)
11497{
11498 ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
11499 << mode << dec << ", " << rdev << ", uid " << perms.uid()
11500 << ", gid " << perms.gid() << ")" << dendl;
11501
11502 if (strlen(name) > NAME_MAX)
11503 return -ENAMETOOLONG;
11504
11505 if (dir->snapid != CEPH_NOSNAP) {
11506 return -EROFS;
11507 }
11508 if (is_quota_files_exceeded(dir, perms)) {
11509 return -EDQUOT;
11510 }
11511
11512 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
11513
11514 filepath path;
11515 dir->make_nosnap_relative_path(path);
11516 path.push_dentry(name);
11517 req->set_filepath(path);
11518 req->set_inode(dir);
11519 req->head.args.mknod.rdev = rdev;
11520 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11521 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11522
11523 bufferlist xattrs_bl;
11524 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11525 if (res < 0)
11526 goto fail;
11527 req->head.args.mknod.mode = mode;
11528 if (xattrs_bl.length() > 0)
11529 req->set_data(xattrs_bl);
11530
11531 Dentry *de;
11532 res = get_or_create(dir, name, &de);
11533 if (res < 0)
11534 goto fail;
11535 req->set_dentry(de);
11536
11537 res = make_request(req, perms, inp);
11538
11539 trim_cache();
11540
11541 ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
11542 return res;
11543
11544 fail:
11545 put_request(req);
11546 return res;
11547}
11548
11549int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
11550 dev_t rdev, struct stat *attr, Inode **out,
11551 const UserPerm& perms)
11552{
11553 Mutex::Locker lock(client_lock);
11554
181888fb
FG
11555 if (unmounting)
11556 return -ENOTCONN;
11557
7c673cae
FG
11558 vinodeno_t vparent = _get_vino(parent);
11559
11560 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
11561 tout(cct) << "ll_mknod" << std::endl;
11562 tout(cct) << vparent.ino.val << std::endl;
11563 tout(cct) << name << std::endl;
11564 tout(cct) << mode << std::endl;
11565 tout(cct) << rdev << std::endl;
11566
11567 if (!cct->_conf->fuse_default_permissions) {
11568 int r = may_create(parent, perms);
11569 if (r < 0)
11570 return r;
11571 }
11572
11573 InodeRef in;
11574 int r = _mknod(parent, name, mode, rdev, perms, &in);
11575 if (r == 0) {
11576 fill_stat(in, attr);
11577 _ll_get(in.get());
11578 }
11579 tout(cct) << attr->st_ino << std::endl;
11580 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
11581 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11582 *out = in.get();
11583 return r;
11584}
11585
11586int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
11587 dev_t rdev, Inode **out,
11588 struct ceph_statx *stx, unsigned want, unsigned flags,
11589 const UserPerm& perms)
11590{
11591 unsigned caps = statx_to_mask(flags, want);
11592 Mutex::Locker lock(client_lock);
11593
181888fb
FG
11594 if (unmounting)
11595 return -ENOTCONN;
11596
7c673cae
FG
11597 vinodeno_t vparent = _get_vino(parent);
11598
11599 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
11600 tout(cct) << "ll_mknodx" << std::endl;
11601 tout(cct) << vparent.ino.val << std::endl;
11602 tout(cct) << name << std::endl;
11603 tout(cct) << mode << std::endl;
11604 tout(cct) << rdev << std::endl;
11605
11606 if (!cct->_conf->fuse_default_permissions) {
11607 int r = may_create(parent, perms);
11608 if (r < 0)
11609 return r;
11610 }
11611
11612 InodeRef in;
11613 int r = _mknod(parent, name, mode, rdev, perms, &in);
11614 if (r == 0) {
11615 fill_statx(in, caps, stx);
11616 _ll_get(in.get());
11617 }
11618 tout(cct) << stx->stx_ino << std::endl;
11619 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
11620 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11621 *out = in.get();
11622 return r;
11623}
11624
// Create and open a regular file via a single MDS CREATE request.
//
// dir/name     parent directory inode and new entry name
// flags, mode  POSIX open flags and permission bits (S_IFREG is OR'ed in)
// inp          out: the newly created inode
// fhp          optional out: when non-NULL, an open Fh for the new file
// stripe_*,
// object_size,
// data_pool    optional file-layout overrides (0 / NULL = defaults)
// created      out-param forwarded to make_request; set if the MDS created
// perms        credentials used for quota/ACL checks and the request
//
// Returns 0 on success, negative errno on failure.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  // Snapshots are read-only; refuse creation inside one.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  // Resolve an explicit data pool name to its id; the on-wire field is
  // 32-bit, hence the -ERANGE guard.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Apply default POSIX ACLs; this may adjust mode and produce xattrs
  // that are installed atomically with the create.
  mode |= S_IFREG;
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // make_request consumes the request on the normal path; only drop our
  // ref when bailing out before submission.
  put_request(req);
  return res;
}
11717
11718
// Create a directory (or, when dir is the special snapdir, a snapshot)
// via an MDS MKDIR/MKSNAP request.
//
// Returns 0 on success with the new inode stored in *inp, negative errno
// otherwise.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Regular snapshots are read-only, but mkdir inside the snapdir is the
  // mechanism for creating a new snapshot, so CEPH_SNAPDIR is allowed.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Apply default POSIX ACLs; may adjust mode and emit xattrs that are
  // installed atomically with the mkdir.
  mode |= S_IFDIR;
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // Only reached before make_request; drop the request ref ourselves.
  put_request(req);
  return res;
}
11774
11775int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
11776 struct stat *attr, Inode **out, const UserPerm& perm)
11777{
11778 Mutex::Locker lock(client_lock);
11779
181888fb
FG
11780 if (unmounting)
11781 return -ENOTCONN;
11782
7c673cae
FG
11783 vinodeno_t vparent = _get_vino(parent);
11784
11785 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
11786 tout(cct) << "ll_mkdir" << std::endl;
11787 tout(cct) << vparent.ino.val << std::endl;
11788 tout(cct) << name << std::endl;
11789 tout(cct) << mode << std::endl;
11790
11791 if (!cct->_conf->fuse_default_permissions) {
11792 int r = may_create(parent, perm);
11793 if (r < 0)
11794 return r;
11795 }
11796
11797 InodeRef in;
11798 int r = _mkdir(parent, name, mode, perm, &in);
11799 if (r == 0) {
11800 fill_stat(in, attr);
11801 _ll_get(in.get());
11802 }
11803 tout(cct) << attr->st_ino << std::endl;
11804 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
11805 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11806 *out = in.get();
11807 return r;
11808}
11809
11810int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
11811 struct ceph_statx *stx, unsigned want, unsigned flags,
11812 const UserPerm& perms)
11813{
11814 Mutex::Locker lock(client_lock);
11815
181888fb
FG
11816 if (unmounting)
11817 return -ENOTCONN;
11818
7c673cae
FG
11819 vinodeno_t vparent = _get_vino(parent);
11820
11821 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
11822 tout(cct) << "ll_mkdirx" << std::endl;
11823 tout(cct) << vparent.ino.val << std::endl;
11824 tout(cct) << name << std::endl;
11825 tout(cct) << mode << std::endl;
11826
11827 if (!cct->_conf->fuse_default_permissions) {
11828 int r = may_create(parent, perms);
11829 if (r < 0)
11830 return r;
11831 }
11832
11833 InodeRef in;
11834 int r = _mkdir(parent, name, mode, perms, &in);
11835 if (r == 0) {
11836 fill_statx(in, statx_to_mask(flags, want), stx);
11837 _ll_get(in.get());
11838 } else {
11839 stx->stx_ino = 0;
11840 stx->stx_mask = 0;
11841 }
11842 tout(cct) << stx->stx_ino << std::endl;
11843 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
11844 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11845 *out = in.get();
11846 return r;
11847}
11848
// Create a symbolic link dir/name -> target via an MDS SYMLINK request.
// Returns 0 on success with the new inode in *inp, negative errno otherwise.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // The link target travels in the request's second string slot.
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // Only reached before make_request; drop the request ref ourselves.
  put_request(req);
  return res;
}
11894
11895int Client::ll_symlink(Inode *parent, const char *name, const char *value,
11896 struct stat *attr, Inode **out, const UserPerm& perms)
11897{
11898 Mutex::Locker lock(client_lock);
11899
181888fb
FG
11900 if (unmounting)
11901 return -ENOTCONN;
11902
7c673cae
FG
11903 vinodeno_t vparent = _get_vino(parent);
11904
11905 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
11906 << dendl;
11907 tout(cct) << "ll_symlink" << std::endl;
11908 tout(cct) << vparent.ino.val << std::endl;
11909 tout(cct) << name << std::endl;
11910 tout(cct) << value << std::endl;
11911
11912 if (!cct->_conf->fuse_default_permissions) {
11913 int r = may_create(parent, perms);
11914 if (r < 0)
11915 return r;
11916 }
11917
11918 InodeRef in;
11919 int r = _symlink(parent, name, value, perms, &in);
11920 if (r == 0) {
11921 fill_stat(in, attr);
11922 _ll_get(in.get());
11923 }
11924 tout(cct) << attr->st_ino << std::endl;
11925 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
11926 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11927 *out = in.get();
11928 return r;
11929}
11930
11931int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
11932 Inode **out, struct ceph_statx *stx, unsigned want,
11933 unsigned flags, const UserPerm& perms)
11934{
11935 Mutex::Locker lock(client_lock);
11936
181888fb
FG
11937 if (unmounting)
11938 return -ENOTCONN;
11939
7c673cae
FG
11940 vinodeno_t vparent = _get_vino(parent);
11941
11942 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
11943 << dendl;
11944 tout(cct) << "ll_symlinkx" << std::endl;
11945 tout(cct) << vparent.ino.val << std::endl;
11946 tout(cct) << name << std::endl;
11947 tout(cct) << value << std::endl;
11948
11949 if (!cct->_conf->fuse_default_permissions) {
11950 int r = may_create(parent, perms);
11951 if (r < 0)
11952 return r;
11953 }
11954
11955 InodeRef in;
11956 int r = _symlink(parent, name, value, perms, &in);
11957 if (r == 0) {
11958 fill_statx(in, statx_to_mask(flags, want), stx);
11959 _ll_get(in.get());
11960 }
11961 tout(cct) << stx->stx_ino << std::endl;
11962 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
11963 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11964 *out = in.get();
11965 return r;
11966}
11967
// Unlink dir/name via an MDS UNLINK request. Looks up the target inode
// first so file delegations can be broken and link caps dropped.
// Returns 0 on success, negative errno otherwise.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 3) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  // Declarations precede the gotos below (C++ forbids jumping over
  // initialization of non-trivial objects).
  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  // Break any delegations on the victim before asking the MDS to unlink it.
  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // Only reached before make_request; drop the request ref ourselves.
  put_request(req);
  return res;
}
12017
12018int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12019{
12020 Mutex::Locker lock(client_lock);
12021
181888fb
FG
12022 if (unmounting)
12023 return -ENOTCONN;
12024
7c673cae
FG
12025 vinodeno_t vino = _get_vino(in);
12026
12027 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12028 tout(cct) << "ll_unlink" << std::endl;
12029 tout(cct) << vino.ino.val << std::endl;
12030 tout(cct) << name << std::endl;
12031
12032 if (!cct->_conf->fuse_default_permissions) {
12033 int r = may_delete(in, name, perm);
12034 if (r < 0)
12035 return r;
12036 }
12037 return _unlink(in, name, perm);
12038}
12039
// Remove a directory (RMDIR), or a snapshot when dir is the snapdir
// (RMSNAP), via an MDS request. Returns 0 or negative errno.
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // Regular snapshots are read-only; rmdir inside the snapdir removes a
  // snapshot and is allowed.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  // RMDIR hands the dentry to the request (which then owns a ref); for
  // RMSNAP we hold a temporary ref ourselves because the reply carries no
  // trace dentry and we must invalidate it by hand below.
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;
  // NOTE(review): in the RMSNAP case a failed _lookup jumps to fail
  // without a matching de->put() for the de->get() above — looks like a
  // dentry ref leak on this error path; confirm against upstream.
  if (op == CEPH_MDS_OP_RMDIR) {
    req->set_inode(dir);
    req->set_other_inode(in.get());
  } else {
    // Manually drop the cached dentry since no tracedn comes back.
    unlink(de, true, true);
    de->put();
    req->set_other_inode(in.get());
  }

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  // Only reached before make_request; drop the request ref ourselves.
  put_request(req);
  return res;
}
12093
12094int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12095{
12096 Mutex::Locker lock(client_lock);
12097
181888fb
FG
12098 if (unmounting)
12099 return -ENOTCONN;
12100
7c673cae
FG
12101 vinodeno_t vino = _get_vino(in);
12102
12103 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12104 tout(cct) << "ll_rmdir" << std::endl;
12105 tout(cct) << vino.ino.val << std::endl;
12106 tout(cct) << name << std::endl;
12107
12108 if (!cct->_conf->fuse_default_permissions) {
12109 int r = may_delete(in, name, perms);
12110 if (r < 0)
12111 return r;
12112 }
12113
12114 return _rmdir(in, name, perms);
12115}
12116
// Rename fromdir/fromname to todir/toname via an MDS RENAME request, or
// RENAMESNAP when renaming inside the snapdir. Crossing snapshots or
// quota roots is rejected with -EXDEV. Returns 0 or negative errno.
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // Only renaming a snapshot within one snapdir is allowed; everything
    // else under a snapshot is read-only.
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  // Renames may not cross quota roots (quota accounting can't move).
  if (fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    // Break delegations on the source before it moves.
    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination may or may not exist; -ENOENT is fine (plain
    // rename), 0 means we are replacing an existing entry.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  // Only reached before make_request; drop the request ref ourselves.
  put_request(req);
  return res;
}
12221
12222int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12223 const char *newname, const UserPerm& perm)
12224{
12225 Mutex::Locker lock(client_lock);
12226
181888fb
FG
12227 if (unmounting)
12228 return -ENOTCONN;
12229
7c673cae
FG
12230 vinodeno_t vparent = _get_vino(parent);
12231 vinodeno_t vnewparent = _get_vino(newparent);
12232
12233 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12234 << vnewparent << " " << newname << dendl;
12235 tout(cct) << "ll_rename" << std::endl;
12236 tout(cct) << vparent.ino.val << std::endl;
12237 tout(cct) << name << std::endl;
12238 tout(cct) << vnewparent.ino.val << std::endl;
12239 tout(cct) << newname << std::endl;
12240
12241 if (!cct->_conf->fuse_default_permissions) {
12242 int r = may_delete(parent, name, perm);
12243 if (r < 0)
12244 return r;
12245 r = may_delete(newparent, newname, perm);
12246 if (r < 0 && r != -ENOENT)
12247 return r;
12248 }
12249
12250 return _rename(parent, name, newparent, newname, perm);
12251}
12252
// Create a hard link dir/newname pointing at inode in, via an MDS LINK
// request. Returns 0 on success with the linked inode in *inp.
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Neither the target nor the destination directory may live in a
  // (read-only) snapshot.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // Break delegations on the target before its link count changes.
  in->break_all_delegs();

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  // filepath2 identifies the existing inode being linked.
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // Only reached before make_request; drop the request ref ourselves.
  put_request(req);
  return res;
}
12297
12298int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12299 const UserPerm& perm)
12300{
12301 Mutex::Locker lock(client_lock);
12302
181888fb
FG
12303 if (unmounting)
12304 return -ENOTCONN;
12305
7c673cae
FG
12306 vinodeno_t vino = _get_vino(in);
12307 vinodeno_t vnewparent = _get_vino(newparent);
12308
31f18b77 12309 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
12310 newname << dendl;
12311 tout(cct) << "ll_link" << std::endl;
12312 tout(cct) << vino.ino.val << std::endl;
12313 tout(cct) << vnewparent << std::endl;
12314 tout(cct) << newname << std::endl;
12315
12316 int r = 0;
12317 InodeRef target;
12318
12319 if (!cct->_conf->fuse_default_permissions) {
12320 if (S_ISDIR(in->mode))
12321 return -EPERM;
12322
12323 r = may_hardlink(in, perm);
12324 if (r < 0)
12325 return r;
12326
12327 r = may_create(newparent, perm);
12328 if (r < 0)
12329 return r;
12330 }
12331
12332 return _link(in, newparent, newname, perm, &target);
12333}
12334
12335int Client::ll_num_osds(void)
12336{
12337 Mutex::Locker lock(client_lock);
12338 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12339}
12340
12341int Client::ll_osdaddr(int osd, uint32_t *addr)
12342{
12343 Mutex::Locker lock(client_lock);
181888fb 12344
7c673cae
FG
12345 entity_addr_t g;
12346 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12347 if (!o.exists(osd))
12348 return false;
12349 g = o.get_addr(osd);
12350 return true;
12351 });
12352 if (!exists)
12353 return -1;
12354 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12355 *addr = ntohl(nb_addr);
12356 return 0;
12357}
181888fb 12358
7c673cae
FG
12359uint32_t Client::ll_stripe_unit(Inode *in)
12360{
12361 Mutex::Locker lock(client_lock);
12362 return in->layout.stripe_unit;
12363}
12364
// Return the snapshot sequence number of the inode's snap realm.
// NOTE(review): dereferences in->snaprealm unconditionally — looks like
// this assumes the inode always has a realm attached; confirm callers
// guarantee that, otherwise this can crash on a null snaprealm.
uint64_t Client::ll_snap_seq(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->snaprealm->seq;
}
12370
12371int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12372{
12373 Mutex::Locker lock(client_lock);
12374 *layout = in->layout;
12375 return 0;
12376}
12377
12378int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12379{
12380 return ll_file_layout(fh->inode.get(), layout);
12381}
12382
12383/* Currently we cannot take advantage of redundancy in reads, since we
12384 would have to go through all possible placement groups (a
12385 potentially quite large number determined by a hash), and use CRUSH
12386 to calculate the appropriate set of OSDs for each placement group,
12387 then index into that. An array with one entry per OSD is much more
12388 tractable and works for demonstration purposes. */
12389
// Map a file block number to the primary OSD currently serving it, using
// the supplied layout's striping parameters and the OSD map's CRUSH
// placement. Returns the primary OSD id.
// NOTE(review): assumes layout->stripe_unit is non-zero (division below)
// and that the layout matches the file — confirm with callers.
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;

  uint64_t stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
  uint64_t stripepos = blockno % stripe_count;   // which object in the object set (X)
  uint64_t objectsetno = stripeno / stripes_per_object;       // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  // Translate object -> PG -> acting set, and report the primary.
  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
	o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
12417
12418/* Return the offset of the block, internal to the object */
12419
12420uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12421{
12422 Mutex::Locker lock(client_lock);
12423 file_layout_t *layout=&(in->layout);
12424 uint32_t object_size = layout->object_size;
12425 uint32_t su = layout->stripe_unit;
12426 uint64_t stripes_per_object = object_size / su;
12427
12428 return (blockno % stripes_per_object) * su;
12429}
12430
12431int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12432 const UserPerm& perms)
12433{
12434 Mutex::Locker lock(client_lock);
12435
181888fb
FG
12436 if (unmounting)
12437 return -ENOTCONN;
12438
7c673cae
FG
12439 vinodeno_t vino = _get_vino(in);
12440
12441 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12442 tout(cct) << "ll_opendir" << std::endl;
12443 tout(cct) << vino.ino.val << std::endl;
12444
12445 if (!cct->_conf->fuse_default_permissions) {
12446 int r = may_open(in, flags, perms);
12447 if (r < 0)
12448 return r;
12449 }
12450
12451 int r = _opendir(in, dirpp, perms);
12452 tout(cct) << (unsigned long)*dirpp << std::endl;
12453
12454 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12455 << dendl;
12456 return r;
12457}
12458
12459int Client::ll_releasedir(dir_result_t *dirp)
12460{
12461 Mutex::Locker lock(client_lock);
12462 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12463 tout(cct) << "ll_releasedir" << std::endl;
12464 tout(cct) << (unsigned long)dirp << std::endl;
181888fb
FG
12465
12466 if (unmounting)
12467 return -ENOTCONN;
12468
7c673cae
FG
12469 _closedir(dirp);
12470 return 0;
12471}
12472
12473int Client::ll_fsyncdir(dir_result_t *dirp)
12474{
12475 Mutex::Locker lock(client_lock);
12476 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12477 tout(cct) << "ll_fsyncdir" << std::endl;
12478 tout(cct) << (unsigned long)dirp << std::endl;
12479
181888fb
FG
12480 if (unmounting)
12481 return -ENOTCONN;
12482
7c673cae
FG
12483 return _fsync(dirp->inode.get(), false);
12484}
12485
12486int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12487{
12488 assert(!(flags & O_CREAT));
12489
12490 Mutex::Locker lock(client_lock);
12491
181888fb
FG
12492 if (unmounting)
12493 return -ENOTCONN;
12494
7c673cae
FG
12495 vinodeno_t vino = _get_vino(in);
12496
12497 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
12498 tout(cct) << "ll_open" << std::endl;
12499 tout(cct) << vino.ino.val << std::endl;
12500 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12501
12502 int r;
12503 if (!cct->_conf->fuse_default_permissions) {
12504 r = may_open(in, flags, perms);
12505 if (r < 0)
12506 goto out;
12507 }
12508
12509 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
12510
12511 out:
12512 Fh *fhptr = fhp ? *fhp : NULL;
12513 if (fhptr) {
12514 ll_unclosed_fh_set.insert(fhptr);
12515 }
12516 tout(cct) << (unsigned long)fhptr << std::endl;
12517 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
12518 " = " << r << " (" << fhptr << ")" << dendl;
12519 return r;
12520}
12521
// Shared implementation behind ll_create/ll_createx: lookup-then-create
// semantics for an open(O_CREAT) style call.
//
// parent/name  directory and entry to look up or create
// mode, flags  POSIX create mode and open flags
// in           out: the (found or created) inode
// caps         cap mask to request on the initial lookup
// fhp          out: open file handle (may come from _create or _open)
// perms        credentials for permission checks and requests
//
// Returns 0 on success, -EEXIST for O_CREAT|O_EXCL on an existing entry,
// or other negative errno.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // Entry already exists and the caller demanded exclusive creation.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  // Missing entry + O_CREAT: create (and possibly open, via fhp) in one
  // MDS round trip.
  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Pre-existing file: re-check open permission and, if _create didn't
    // already hand us a handle, open it now.
    if (!cct->_conf->fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Track any handle we produced so an unmount can clean it up.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
12603
12604int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12605 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12606 const UserPerm& perms)
12607{
12608 Mutex::Locker lock(client_lock);
12609 InodeRef in;
12610
181888fb
FG
12611 if (unmounting)
12612 return -ENOTCONN;
12613
7c673cae
FG
12614 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12615 fhp, perms);
12616 if (r >= 0) {
12617 assert(in);
12618
12619 // passing an Inode in outp requires an additional ref
12620 if (outp) {
12621 _ll_get(in.get());
12622 *outp = in.get();
12623 }
12624 fill_stat(in, attr);
12625 } else {
12626 attr->st_ino = 0;
12627 }
12628
12629 return r;
12630}
12631
12632int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12633 int oflags, Inode **outp, Fh **fhp,
12634 struct ceph_statx *stx, unsigned want, unsigned lflags,
12635 const UserPerm& perms)
12636{
12637 unsigned caps = statx_to_mask(lflags, want);
12638 Mutex::Locker lock(client_lock);
12639 InodeRef in;
12640
181888fb
FG
12641 if (unmounting)
12642 return -ENOTCONN;
7c673cae
FG
12643
12644 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12645 if (r >= 0) {
12646 assert(in);
12647
12648 // passing an Inode in outp requires an additional ref
12649 if (outp) {
12650 _ll_get(in.get());
12651 *outp = in.get();
12652 }
12653 fill_statx(in, caps, stx);
12654 } else {
12655 stx->stx_ino = 0;
12656 stx->stx_mask = 0;
12657 }
12658
12659 return r;
12660}
12661
12662loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12663{
12664 Mutex::Locker lock(client_lock);
12665 tout(cct) << "ll_lseek" << std::endl;
12666 tout(cct) << offset << std::endl;
12667 tout(cct) << whence << std::endl;
12668
181888fb
FG
12669 if (unmounting)
12670 return -ENOTCONN;
12671
7c673cae
FG
12672 return _lseek(fh, offset, whence);
12673}
12674
12675int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12676{
12677 Mutex::Locker lock(client_lock);
12678 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12679 tout(cct) << "ll_read" << std::endl;
12680 tout(cct) << (unsigned long)fh << std::endl;
12681 tout(cct) << off << std::endl;
12682 tout(cct) << len << std::endl;
12683
181888fb
FG
12684 if (unmounting)
12685 return -ENOTCONN;
12686
7c673cae
FG
12687 return _read(fh, off, len, bl);
12688}
12689
// Read a raw RADOS object backing one file block directly from the OSDs,
// bypassing the page cache. Copies up to `length` bytes into buf and
// returns the number of bytes read, or negative errno.
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop client_lock while blocking on the OSD reply so other client
  // operations can make progress; the Locker above re-releases on return.
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    // Success: hand the data to the caller and report its actual size.
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
12726
12727/* It appears that the OSD doesn't return success unless the entire
12728 buffer was written, return the write length on success. */
12729
// Write a raw RADOS object backing one file block directly to the OSDs.
// Returns `length` on success (the OSD only acks complete writes) or
// negative errno.
// NOTE(review): the `if (true || sync)` below forces the synchronous
// path unconditionally; the unstable-write branch is dead code kept for
// a future barrier implementation (see the commented C_Block_Sync).
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  Mutex flock("Client::ll_write_block flock");
  vinodeno_t vino = ll_get_vino(in);
  Cond cond;
  bool done;
  int r = 0;
  Context *onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe = new C_SafeCond(&flock, &cond, &done, &r);
    done = false;
  } else {
    /* if write is unstable, we just place a barrier for
     * future commits to wait on */
    /*onsafe = new C_Block_Sync(this, vino.ino,
			       barrier_interval(offset, offset + length), &r);
    */
    done = true;
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  // Copy the caller's buffer into a bufferlist the objecter can own.
  bufferptr bp;
  if (length > 0) bp = buffer::copy(buf, length);
  bufferlist bl;
  bl.push_back(bp);

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();
  if (unmounting) {
    client_lock.Unlock();
    // The completion was never handed to the objecter; free it here.
    delete onsafe;
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe);

  client_lock.Unlock();
  // Synchronous path: wait (under flock, not client_lock) until the
  // C_SafeCond completion signals and fills r.
  if (!done /* also !sync */) {
    flock.Lock();
    while (! done)
      cond.Wait(flock);
    flock.Unlock();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
12802
// Commit previously-issued unstable block writes in [offset, offset+length).
// The barrier-based implementation is currently disabled (commented out,
// matching the disabled unstable path in ll_write_block), so this is a no-op
// that always reports success.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  Mutex::Locker lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
12828
// Low-level write: write "len" bytes from "data" to "fh" at offset "off".
// Returns bytes written or a negative errno.
int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
    "~" << len << dendl;
  tout(cct) << "ll_write" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = _write(fh, off, len, data, NULL, 0);
  ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
		<< dendl;
  return r;
}
12847
// Low-level flush: push buffered dirty data for "fh" toward the OSDs.
// Returns 0 or a negative errno.
int Client::ll_flush(Fh *fh)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flush(fh);
}
12860
// Low-level fsync.  If syncdataonly, only file data is synced (not metadata).
// Returns 0 or a negative errno.
int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH
    // (so the same async error is not reported again on a later call).
    fh->take_async_err();
  }
  return r;
}
12878
28e407b8
AA
// Sync an inode (rather than an open file handle).  If syncdataonly, only
// file data is synced.  Returns 0 or a negative errno.
int Client::ll_sync_inode(Inode *in, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
  tout(cct) << "ll_sync_inode" << std::endl;
  tout(cct) << (unsigned long)in << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fsync(in, syncdataonly);
}
12891
7c673cae
FG
12892#ifdef FALLOC_FL_PUNCH_HOLE
12893
// Core fallocate implementation (platforms with FALLOC_FL_PUNCH_HOLE).
// Supports plain allocation (extends i_size), KEEP_SIZE, and
// PUNCH_HOLE|KEEP_SIZE.  Caller holds client_lock.  Returns 0 or a
// negative errno.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  // Only these two flags are supported.
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // Linux semantics: PUNCH_HOLE must be combined with KEEP_SIZE.
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocating into a full pool would block forever; punching holes frees
  // space and is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  uint64_t size = offset + length;
  std::list<InodeRef> quota_roots;
  // Only a plain size-extending fallocate consumes new quota; the check also
  // records the quota roots for the "approaching" test further down.
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms, &quota_roots)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and buffer cap held: punch the hole locally by splicing
      // zeroes into the inline blob.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      if (in->inline_version < CEPH_INLINE_NONE) {
        // Must flush inline data out to RADOS before zeroing objects.
        onuninline = new C_SafeCond(&uninline_flock,
                                    &uninline_cond,
                                    &uninline_done,
                                    &uninline_ret);
        uninline_data(in, onuninline);
      }

      Mutex flock("Client::_punch_hole flock");
      Cond cond;
      bool done = false;
      Context *onfinish = new C_SafeCond(&flock, &cond, &done);

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      // Drop any cached pages over the hole, then zero the range on the OSDs.
      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, onfinish);
      in->mtime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Wait for the zero op without holding client_lock.
      client_lock.Unlock();
      flock.Lock();
      while (!done)
        cond.Wait(flock);
      flock.Unlock();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain fallocate: just extend the (sparse) file size.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, quota_roots)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (onuninline) {
    // Wait for the inline-data flush started above.
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else already uninlined it; either way the
    // inline copy is now stale.
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13029#else
13030
13031int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13032{
13033 return -EOPNOTSUPP;
13034}
13035
13036#endif
13037
13038
// Low-level fallocate entry point; validates mount state then delegates to
// _fallocate().  Returns 0 or a negative errno.
int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fallocate(fh, mode, offset, length);
}
13051
13052int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13053{
13054 Mutex::Locker lock(client_lock);
13055 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
13056
181888fb
FG
13057 if (unmounting)
13058 return -ENOTCONN;
13059
7c673cae
FG
13060 Fh *fh = get_filehandle(fd);
13061 if (!fh)
13062 return -EBADF;
13063#if defined(__linux__) && defined(O_PATH)
13064 if (fh->flags & O_PATH)
13065 return -EBADF;
13066#endif
13067 return _fallocate(fh, mode, offset, length);
13068}
13069
13070int Client::ll_release(Fh *fh)
13071{
13072 Mutex::Locker lock(client_lock);
13073 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
13074 dendl;
13075 tout(cct) << "ll_release (fh)" << std::endl;
13076 tout(cct) << (unsigned long)fh << std::endl;
13077
181888fb
FG
13078 if (unmounting)
13079 return -ENOTCONN;
13080
7c673cae
FG
13081 if (ll_unclosed_fh_set.count(fh))
13082 ll_unclosed_fh_set.erase(fh);
13083 return _release_fh(fh);
13084}
13085
13086int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13087{
13088 Mutex::Locker lock(client_lock);
13089
13090 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13091 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13092
181888fb
FG
13093 if (unmounting)
13094 return -ENOTCONN;
13095
7c673cae
FG
13096 return _getlk(fh, fl, owner);
13097}
13098
13099int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13100{
13101 Mutex::Locker lock(client_lock);
13102
13103 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
13104 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13105
181888fb
FG
13106 if (unmounting)
13107 return -ENOTCONN;
13108
7c673cae
FG
13109 return _setlk(fh, fl, owner, sleep);
13110}
13111
// Low-level BSD flock(2)-style lock operation ("cmd" is LOCK_SH/LOCK_EX/
// LOCK_UN, possibly | LOCK_NB).  Returns 0 or a negative errno.
int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flock(fh, cmd, owner);
}
13124
b32b8144
FG
13125int Client::set_deleg_timeout(uint32_t timeout)
13126{
13127 Mutex::Locker lock(client_lock);
13128
13129 /*
13130 * The whole point is to prevent blacklisting so we must time out the
13131 * delegation before the session autoclose timeout kicks in.
13132 */
13133 if (timeout >= mdsmap->get_session_autoclose())
13134 return -EINVAL;
13135
13136 deleg_timeout = timeout;
13137 return 0;
13138}
13139
13140int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13141{
13142 int ret = -EINVAL;
13143
13144 Mutex::Locker lock(client_lock);
13145
13146 if (!mounted)
13147 return -ENOTCONN;
13148
13149 Inode *inode = fh->inode.get();
13150
13151 switch(cmd) {
13152 case CEPH_DELEGATION_NONE:
13153 inode->unset_deleg(fh);
13154 ret = 0;
13155 break;
13156 default:
13157 try {
13158 ret = inode->set_deleg(fh, cmd, cb, priv);
13159 } catch (std::bad_alloc) {
13160 ret = -ENOMEM;
13161 }
13162 break;
13163 }
13164 return ret;
13165}
13166
7c673cae
FG
// Finisher context that interrupts an in-flight file-lock MDS request.
// Holds a ref on the request for its own lifetime (get() in ctor,
// put_request() when it runs).
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    Mutex::Locker l(client->client_lock);
    // Only SETFILELOCK requests are interruptible this way.
    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
13182
13183void Client::ll_interrupt(void *d)
13184{
13185 MetaRequest *req = static_cast<MetaRequest*>(d);
13186 ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
13187 tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
13188 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13189}
13190
13191// =========================================
13192// layout
13193
13194// expose file layouts
13195
// Look up the file layout of the file at "relpath" and copy it into *lp.
// Returns 0 or a negative errno (e.g. from the path walk).
int Client::describe_layout(const char *relpath, file_layout_t *lp,
			    const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
  return 0;
}
13215
// Copy the file layout of the open file "fd" into *lp.
// Returns 0, -EBADF for an unknown fd, or -ENOTCONN when unmounting.
int Client::fdescribe_layout(int fd, file_layout_t *lp)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  *lp = in->layout;

  ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
  return 0;
}
13233
d2e6a577
FG
// Return the pool id files are created in by default, or -ENOTCONN when
// unmounting.
int64_t Client::get_default_pool_id()
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}
7c673cae
FG
13244
13245// expose osdmap
13246
// Resolve a pool name to its id via the OSD map.  Returns the pool id,
// a negative errno on lookup failure, or -ENOTCONN when unmounting.
int64_t Client::get_pool_id(const char *pool_name)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
			       pool_name);
}
13257
// Resolve a pool id to its name via the OSD map.  Returns the empty string
// if the pool does not exist or the client is unmounting.
string Client::get_pool_name(int64_t pool)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return string();

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
    });
}
13269
// Return the replication size of a pool, -ENOENT if the pool is unknown,
// or -ENOTCONN when unmounting.
int Client::get_pool_replication(int64_t pool)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
    });
}
13281
// For file offset "off" of open file "fd", return the acting OSDs of the
// object holding that byte, and (optionally, via *len) how many bytes remain
// in the same stripe unit.  Returns 0 or a negative errno.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map a single byte at "off" to its object; with length 1 the striper
  // always yields exactly one extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13327
// Return the ordered CRUSH location (type,name pairs) of OSD "id".
// Returns 0 or a negative errno (-EINVAL for a negative id).
int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  if (id < 0)
    return -EINVAL;
  return objecter->with_osdmap([&](const OSDMap& o) {
      return o.crush->get_full_location_ordered(id, path);
    });
}
13341
// Return the network addresses of the acting OSDs for the object containing
// byte "offset" of open file "fd".  Returns 0 or a negative errno.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?  (length 1 => exactly one extent back from the striper)
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addr(osds[i]);
	address.push_back(addr);
      }
      return 0;
    });
}
13375
// Look up the address of OSD "osd" in the current OSD map.
// Returns 0 with addr filled in, -ENOENT if the OSD does not exist, or
// -ENOTCONN when unmounting.
int Client::get_osd_addr(int osd, entity_addr_t& addr)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return -ENOENT;

      addr = o.get_addr(osd);
      return 0;
    });
}
13391
// Map the byte range [offset, offset+length) of open file "fd" to its list
// of object extents.  Returns 0 or a negative errno.
int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
			     loff_t length, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map to a list of extents
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);

  ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
  return 0;
}
13411
13412
b32b8144 13413/* find an osd with the same ip. -ENXIO if none. */
7c673cae
FG
// Find an OSD sharing this client's IP address (cached per osdmap epoch).
// Returns the OSD id, -ENXIO if none matches, or -ENOTCONN when unmounting.
int Client::get_local_osd()
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // Recompute only when the osdmap epoch has changed since the last lookup.
  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddr());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
13429
13430
13431
13432
13433
13434
13435// ===============================
13436
// Messenger callback: a connection was established.  Log only; no state to
// update here.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
}
13441
// Messenger callback: our side of a connection was reset.  Returning false
// declines to handle it here.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
  return false;
}
13447
// Messenger callback: the peer reset the connection.  For MDS peers, react
// according to the state of the corresponding session.
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
  Mutex::Locker l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p) {
	if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
	  mds = p->first;
	  s = p->second;
	}
      }
      // mds stays MDS_RANK_NONE (< 0) when no session matched the peer addr.
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // Retry the open: close the half-open session but carry its
	    // open-waiters over to the replacement session.
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    const md_config_t *conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  // Nothing to do for sessions that never opened or already closed.
	  break;
	}
      }
    }
    break;
  }
}
13509
// Messenger callback: the peer refused our connection.  Log only; returning
// false declines to handle it here.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
  return false;
}
13515
// Messenger callback: build an authorizer for an outgoing connection.
// Monitors authenticate differently, so no authorizer is built for them.
bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
{
  if (dest_type == CEPH_ENTITY_TYPE_MON)
    return true;
  *authorizer = monclient->build_authorizer(dest_type);
  return true;
}
13523
// Walk up from "in" to find its nearest ancestor with a quota set (the
// "quota root"), querying the MDS for unknown parent links as needed.
// Falls back to root_ancestor when the chain cannot be resolved.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    // Stop at the first ancestor (not "in" itself) that has a quota.
    if (cur != in && cur->quota.is_enable())
      break;

    // Try to resolve the parent from locally cached, still-valid state:
    // either a dentry lease or a shared cap on the parent directory.
    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
	Dentry *dn = *p;
	if (dn->lease_mds >= 0 &&
	    dn->lease_ttl > now &&
	    mds_sessions.count(dn->lease_mds)) {
	  parent_in = dn->dir->parent_inode;
	} else {
	  Inode *diri = dn->dir->parent_inode;
	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	      diri->shared_gen == dn->cap_shared_gen) {
	    parent_in = dn->dir->parent_inode;
	  }
	}
	if (parent_in)
	  break;
      }
    } else if (root_parents.count(cur)) {
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    if (cur == root_ancestor)
      break;

    // deleted inode
    if (cur->nlink == 0) {
      cur = root_ancestor;
      break;
    }

    // No cached parent: ask the MDS for this inode's name/parent.
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
		    << " failed to find parent of " << cur->vino()
		    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    // The MDS round trip may have taken a while; refresh "now" for lease
    // checks, and restart from "in" if the cache changed underneath us.
    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
13595
13596/**
13597 * Traverse quota ancestors of the Inode, return true
13598 * if any of them passes the passed function
13599 */
13600bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13601 std::function<bool (const Inode &in)> test)
13602{
13603 while (true) {
13604 assert(in != NULL);
13605 if (test(*in)) {
13606 return true;
13607 }
13608
13609 if (in == root_ancestor) {
13610 // We're done traversing, drop out
13611 return false;
13612 } else {
13613 // Continue up the tree
13614 in = get_quota_root(in, perms);
13615 }
13616 }
13617
13618 return false;
13619}
13620
// Return true if creating one more file under "in" would hit a max_files
// quota on any of its quota ancestors.
bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [](const Inode &in) {
        return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
      });
}
13628
// Return true if adding "new_bytes" under "in" would exceed a max_bytes
// quota on any of its quota ancestors.  If quota_roots is non-null, every
// quota root visited is recorded there (consumed later by
// is_quota_bytes_approaching()).
bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
				     const UserPerm& perms,
				     std::list<InodeRef>* quota_roots)
{
  return check_quota_condition(in, perms,
      [&new_bytes, quota_roots](const Inode &in) {
	if (quota_roots)
	  quota_roots->emplace_back(const_cast<Inode*>(&in));
	return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
	       > in.quota.max_bytes;
      });
}
13641
28e407b8 13642bool Client::is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots)
7c673cae 13643{
28e407b8
AA
13644 assert(in->size >= in->reported_size);
13645 const uint64_t size = in->size - in->reported_size;
13646
13647 for (auto& diri : quota_roots) {
13648 if (diri->quota.max_bytes) {
13649 if (diri->rstat.rbytes >= diri->quota.max_bytes)
13650 return true;
13651
13652 uint64_t space = diri->quota.max_bytes - diri->rstat.rbytes;
13653 if ((space >> 4) < size)
13654 return true;
13655 }
13656 }
13657 return false;
7c673cae
FG
13658}
13659
// Cached state bits for per-pool permission checks (see check_pool_perm()).
enum {
  POOL_CHECKED = 1,   // a check has completed for this pool
  POOL_CHECKING = 2,  // a check is in flight; concurrent callers wait
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
13666
// Verify (and cache) whether this client may read/write the data pool of
// inode "in".  "need" is a CEPH_CAP_FILE_RD/WR mask.  Probes the pool with a
// stat (read) and a create (write) on the file's first object, caching the
// outcome per (pool, namespace).  Returns 0, -EPERM, or -EIO on an
// indeterminate probe.  Caller holds client_lock; it is dropped while the
// probe ops are in flight.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Claim the in-flight slot so other callers wait instead of re-probing.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: stat the object (ENOENT still proves read access).
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: exclusive create (EEXIST still proves write access).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // Publish the result and wake any waiters.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13769
13770int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13771{
13772 if (acl_type == POSIX_ACL) {
13773 if (in->xattrs.count(ACL_EA_ACCESS)) {
13774 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13775
13776 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13777 }
13778 }
13779 return -EAGAIN;
13780}
13781
// Propagate a chmod into the inode's POSIX access ACL (the ACL's mode-class
// entries must track the new mode bits).  No-op when ACLs are disabled or no
// access ACL exists.  Returns 0 or a negative errno.
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Ensure xattrs are fresh before reading the ACL from them.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a copy so the cached xattr is not mutated in place.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
13807
// Compute the ACL xattrs a new child of "dir" should be created with, per
// POSIX default-ACL inheritance.  May adjust *mode (inheritance or umask).
// On success returns the number of xattrs encoded into xattrs_bl (0 if
// none); negative errno on failure.  Symlinks never carry ACLs.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  if (S_ISLNK(*mode))
    return 0;

  // Ensure the parent's xattrs (and thus its default ACL) are fresh.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// The inherited ACL is not fully representable by mode bits; the
	// child needs its own access ACL.
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask instead (POSIX behavior).
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
13855
13856void Client::set_filer_flags(int flags)
13857{
13858 Mutex::Locker l(client_lock);
13859 assert(flags == 0 ||
13860 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13861 objecter->add_global_op_flags(flags);
13862}
13863
13864void Client::clear_filer_flags(int flags)
13865{
13866 Mutex::Locker l(client_lock);
13867 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13868 objecter->clear_global_op_flag(flags);
13869}
13870
13871/**
13872 * This is included in cap release messages, to cause
13873 * the MDS to wait until this OSD map epoch. It is necessary
13874 * in corner cases where we cancel RADOS ops, so that
13875 * nobody else tries to do IO to the same objects in
13876 * the same epoch as the cancelled ops.
13877 */
13878void Client::set_cap_epoch_barrier(epoch_t e)
13879{
13880 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
13881 cap_epoch_barrier = e;
13882}
13883
13884const char** Client::get_tracked_conf_keys() const
13885{
13886 static const char* keys[] = {
13887 "client_cache_size",
13888 "client_cache_mid",
13889 "client_acl_type",
b32b8144
FG
13890 "client_deleg_timeout",
13891 "client_deleg_break_on_open",
7c673cae
FG
13892 NULL
13893 };
13894 return keys;
13895}
13896
13897void Client::handle_conf_change(const struct md_config_t *conf,
13898 const std::set <std::string> &changed)
13899{
13900 Mutex::Locker lock(client_lock);
13901
181888fb 13902 if (changed.count("client_cache_mid")) {
7c673cae
FG
13903 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
13904 }
13905 if (changed.count("client_acl_type")) {
13906 acl_type = NO_ACL;
13907 if (cct->_conf->client_acl_type == "posix_acl")
13908 acl_type = POSIX_ACL;
13909 }
13910}
13911
7c673cae
FG
// boost::intrusive_ptr hook: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
13916
// boost::intrusive_ptr hook: drop a reference; the owning client
// handles the put so the inode can be freed/cleaned up properly.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
13921
13922mds_rank_t Client::_get_random_up_mds() const
13923{
13924 assert(client_lock.is_locked_by_me());
13925
13926 std::set<mds_rank_t> up;
13927 mdsmap->get_up_mds_set(up);
13928
13929 if (up.empty())
13930 return MDS_RANK_NONE;
13931 std::set<mds_rank_t>::const_iterator p = up.begin();
13932 for (int n = rand() % up.size(); n; n--)
13933 ++p;
13934 return *p;
13935}
13936
13937
// Standalone variant owns its own Objecter (deleted in the dtor).
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
    : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
13944
StandaloneClient::~StandaloneClient()
{
  // We own the objecter we passed to the base class; release it here.
  delete objecter;
  objecter = nullptr;
}
13950
// Bring up the standalone client: timer, object cacher, objecter,
// dispatchers, and monitor client. Returns 0 on success or a
// negative error code from monclient->init(), after unwinding the
// partially-initialized state.
int StandaloneClient::init()
{
  // These do not require client_lock.
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  // Objecter is registered ahead of the client dispatcher.
  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    // NOTE(review): client_lock is dropped before objecter->shutdown(),
    // presumably to avoid lock-order issues — confirm before reordering.
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
13981
// Shut down in reverse dependency order: client state first, then the
// objecter we own, then the monitor client.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}